summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Kornel <kornel@geekhood.net> 2020-03-03 00:00:15 +0000
committer Kornel <kornel@geekhood.net> 2020-03-10 00:43:09 +0000
commit f28189567a12c9f289eb4c678c485047f73a5be5 (patch)
tree 5119dab55c4e9772592fb0bcaeffb34560ba8fff
parent b0b1436e5e8ffa48f7fa2c4c030d827591420854 (diff)
fmt
-rw-r--r-- feat_extractor/src/lib.rs 99
1 files changed, 49 insertions, 50 deletions
diff --git a/feat_extractor/src/lib.rs b/feat_extractor/src/lib.rs
index fae0c4a..1cd8150 100644
--- a/feat_extractor/src/lib.rs
+++ b/feat_extractor/src/lib.rs
@@ -2,7 +2,6 @@ use std::collections::HashSet;
use rich_crate::ManifestExt;
use rich_crate::Manifest;
-
lazy_static::lazy_static! {
/// ignore these as keywords
pub(crate) static ref STOPWORDS: HashSet<&'static str> = [
@@ -35,60 +34,60 @@ lazy_static::lazy_static! {
].iter().copied().collect();
}
- // returns an array of lowercase phrases
- fn extract_text_phrases(manifest: &Manifest, github_description: Option<&str>, readme_text: Option<&str>) -> Vec<(f64, String)> {
- let mut out = Vec::new();
- let mut len = 0;
- if let Some(s) = &manifest.package().description {
- let s = s.to_lowercase();
- len += s.len();
- out.push((1., s));
- }
- if let Some(s) = github_description {
- let s = s.to_lowercase();
- len += s.len();
- out.push((1., s));
- }
- if let Some(sub) = &readme_text {
- // render readme to DOM, extract nodes
- for par in sub.split('\n') {
- if len > 200 {
- break;
- }
- let par = par.trim_start_matches(|c: char| c.is_whitespace() || c == '#' || c == '=' || c == '*' || c == '-');
- let par = par.replace("http://", " ").replace("https://", " ");
- if !par.is_empty() {
- let par = par.to_lowercase();
- len += par.len();
- out.push((0.4, par));
- }
+// returns an array of lowercase phrases
+fn extract_text_phrases(manifest: &Manifest, github_description: Option<&str>, readme_text: Option<&str>) -> Vec<(f64, String)> {
+ let mut out = Vec::new();
+ let mut len = 0;
+ if let Some(s) = &manifest.package().description {
+ let s = s.to_lowercase();
+ len += s.len();
+ out.push((1., s));
+ }
+ if let Some(s) = github_description {
+ let s = s.to_lowercase();
+ len += s.len();
+ out.push((1., s));
+ }
+ if let Some(sub) = &readme_text {
+ // render readme to DOM, extract nodes
+ for par in sub.split('\n') {
+ if len > 200 {
+ break;
+ }
+ let par = par.trim_start_matches(|c: char| c.is_whitespace() || c == '#' || c == '=' || c == '*' || c == '-');
+ let par = par.replace("http://", " ").replace("https://", " ");
+ if !par.is_empty() {
+ let par = par.to_lowercase();
+ len += par.len();
+ out.push((0.4, par));
}
}
- out
}
+ out
+}
- pub fn auto_keywords(manifest: &Manifest, github_description: Option<&str>, readme_text: Option<&str>) -> Vec<(f32, String)> {
- let d = extract_text_phrases(manifest, github_description, readme_text);
- let mut sw = rake::StopWords::new();
- sw.reserve(STOPWORDS.len());
- sw.extend(STOPWORDS.iter().map(|s| s.to_string())); // TODO: use real stopwords, THEN filter via STOPWORDS again, because multiple Rust-y words are fine
- // normalize space and _ to -
- let r = rake::Rake::new(sw);
- let rake_keywords = r.run_sentences(d.iter().map(|(_, s)| s.as_str()));
- let rake_keywords = rake_keywords.iter()
- .map(|k| (
- k.score.min(1.1), //
- chop3words(k.keyword.as_str()) // rake generates very long setences sometimes
- ));
- // split on / and punctuation too
- let keywords = d.iter().flat_map(|&(w2, ref d)| d.split_whitespace().map(move |s| (w2, s.trim_end_matches("'s"))))
- .filter(|&(_, k)| k.len() >= 2)
- .filter(|&(_, k)| STOPWORDS.get(k).is_none());
+pub fn auto_keywords(manifest: &Manifest, github_description: Option<&str>, readme_text: Option<&str>) -> Vec<(f32, String)> {
+ let d = extract_text_phrases(manifest, github_description, readme_text);
+ let mut sw = rake::StopWords::new();
+ sw.reserve(STOPWORDS.len());
+ sw.extend(STOPWORDS.iter().map(|s| s.to_string())); // TODO: use real stopwords, THEN filter via STOPWORDS again, because multiple Rust-y words are fine
+ // normalize space and _ to -
+ let r = rake::Rake::new(sw);
+ let rake_keywords = r.run_sentences(d.iter().map(|(_, s)| s.as_str()));
+ let rake_keywords = rake_keywords.iter()
+ .map(|k| (
+ k.score.min(1.1), //
+ chop3words(k.keyword.as_str()) // rake generates very long setences sometimes
+ ));
+ // split on / and punctuation too
+ let keywords = d.iter().flat_map(|&(w2, ref d)| d.split_whitespace().map(move |s| (w2, s.trim_end_matches("'s"))))
+ .filter(|&(_, k)| k.len() >= 2)
+ .filter(|&(_, k)| STOPWORDS.get(k).is_none());
- // replace ' ' with '-'
- // keep if 3 words or less
- rake_keywords.chain(keywords).take(25).map(|(w, s)| (w as f32, s.to_owned())).collect()
- }
+ // replace ' ' with '-'
+ // keep if 3 words or less
+ rake_keywords.chain(keywords).take(25).map(|(w, s)| (w as f32, s.to_owned())).collect()
+}
fn chop3words(s: &str) -> &str {
let mut words = 0;