diff options
author | Kornel <kornel@geekhood.net> | 2020-03-03 00:00:15 +0000 |
---|---|---|
committer | Kornel <kornel@geekhood.net> | 2020-03-10 00:43:09 +0000 |
commit | f28189567a12c9f289eb4c678c485047f73a5be5 (patch) | |
tree | 5119dab55c4e9772592fb0bcaeffb34560ba8fff | |
parent | b0b1436e5e8ffa48f7fa2c4c030d827591420854 (diff) |
fmt
-rw-r--r-- | feat_extractor/src/lib.rs | 99 |
1 files changed, 49 insertions, 50 deletions
diff --git a/feat_extractor/src/lib.rs b/feat_extractor/src/lib.rs index fae0c4a..1cd8150 100644 --- a/feat_extractor/src/lib.rs +++ b/feat_extractor/src/lib.rs @@ -2,7 +2,6 @@ use std::collections::HashSet; use rich_crate::ManifestExt; use rich_crate::Manifest; - lazy_static::lazy_static! { /// ignore these as keywords pub(crate) static ref STOPWORDS: HashSet<&'static str> = [ @@ -35,60 +34,60 @@ lazy_static::lazy_static! { ].iter().copied().collect(); } - // returns an array of lowercase phrases - fn extract_text_phrases(manifest: &Manifest, github_description: Option<&str>, readme_text: Option<&str>) -> Vec<(f64, String)> { - let mut out = Vec::new(); - let mut len = 0; - if let Some(s) = &manifest.package().description { - let s = s.to_lowercase(); - len += s.len(); - out.push((1., s)); - } - if let Some(s) = github_description { - let s = s.to_lowercase(); - len += s.len(); - out.push((1., s)); - } - if let Some(sub) = &readme_text { - // render readme to DOM, extract nodes - for par in sub.split('\n') { - if len > 200 { - break; - } - let par = par.trim_start_matches(|c: char| c.is_whitespace() || c == '#' || c == '=' || c == '*' || c == '-'); - let par = par.replace("http://", " ").replace("https://", " "); - if !par.is_empty() { - let par = par.to_lowercase(); - len += par.len(); - out.push((0.4, par)); - } +// returns an array of lowercase phrases +fn extract_text_phrases(manifest: &Manifest, github_description: Option<&str>, readme_text: Option<&str>) -> Vec<(f64, String)> { + let mut out = Vec::new(); + let mut len = 0; + if let Some(s) = &manifest.package().description { + let s = s.to_lowercase(); + len += s.len(); + out.push((1., s)); + } + if let Some(s) = github_description { + let s = s.to_lowercase(); + len += s.len(); + out.push((1., s)); + } + if let Some(sub) = &readme_text { + // render readme to DOM, extract nodes + for par in sub.split('\n') { + if len > 200 { + break; + } + let par = par.trim_start_matches(|c: char| c.is_whitespace() || c == '#' || c == '=' || c == '*' || c == '-'); + let par = par.replace("http://", " ").replace("https://", " "); + if !par.is_empty() { + let par = par.to_lowercase(); + len += par.len(); + out.push((0.4, par)); } } - out } + out +} - pub fn auto_keywords(manifest: &Manifest, github_description: Option<&str>, readme_text: Option<&str>) -> Vec<(f32, String)> { - let d = extract_text_phrases(manifest, github_description, readme_text); - let mut sw = rake::StopWords::new(); - sw.reserve(STOPWORDS.len()); - sw.extend(STOPWORDS.iter().map(|s| s.to_string())); // TODO: use real stopwords, THEN filter via STOPWORDS again, because multiple Rust-y words are fine - // normalize space and _ to - - let r = rake::Rake::new(sw); - let rake_keywords = r.run_sentences(d.iter().map(|(_, s)| s.as_str())); - let rake_keywords = rake_keywords.iter() - .map(|k| ( - k.score.min(1.1), // - chop3words(k.keyword.as_str()) // rake generates very long setences sometimes - )); - // split on / and punctuation too - let keywords = d.iter().flat_map(|&(w2, ref d)| d.split_whitespace().map(move |s| (w2, s.trim_end_matches("'s")))) - .filter(|&(_, k)| k.len() >= 2) - .filter(|&(_, k)| STOPWORDS.get(k).is_none()); +pub fn auto_keywords(manifest: &Manifest, github_description: Option<&str>, readme_text: Option<&str>) -> Vec<(f32, String)> { + let d = extract_text_phrases(manifest, github_description, readme_text); + let mut sw = rake::StopWords::new(); + sw.reserve(STOPWORDS.len()); + sw.extend(STOPWORDS.iter().map(|s| s.to_string())); // TODO: use real stopwords, THEN filter via STOPWORDS again, because multiple Rust-y words are fine + // normalize space and _ to - + let r = rake::Rake::new(sw); + let rake_keywords = r.run_sentences(d.iter().map(|(_, s)| s.as_str())); + let rake_keywords = rake_keywords.iter() + .map(|k| ( + k.score.min(1.1), // + chop3words(k.keyword.as_str()) // rake generates very long setences sometimes + )); + // split on / and punctuation too + let keywords = d.iter().flat_map(|&(w2, ref d)| d.split_whitespace().map(move |s| (w2, s.trim_end_matches("'s")))) + .filter(|&(_, k)| k.len() >= 2) + .filter(|&(_, k)| STOPWORDS.get(k).is_none()); - // replace ' ' with '-' - // keep if 3 words or less - rake_keywords.chain(keywords).take(25).map(|(w, s)| (w as f32, s.to_owned())).collect() - } + // replace ' ' with '-' + // keep if 3 words or less + rake_keywords.chain(keywords).take(25).map(|(w, s)| (w as f32, s.to_owned())).collect() +} fn chop3words(s: &str) -> &str { let mut words = 0; |