diff options
author | Kornel <kornel@geekhood.net> | 2020-03-02 23:59:03 +0000 |
---|---|---|
committer | Kornel <kornel@geekhood.net> | 2020-03-03 19:24:32 +0000 |
commit | 87af4966975e6b70b176f9fe75877ba99b7a6264 (patch) | |
tree | 47c622b98d9533e0d1891ed5ebe044a1b4e4045b /crate_db/src/lib_crate_db.rs | |
parent | 8d792a73a64b92b298872c1726013b19205bab53 (diff) |
Extract keyword extraction
Diffstat (limited to 'crate_db/src/lib_crate_db.rs')
-rw-r--r-- | crate_db/src/lib_crate_db.rs | 75 |
1 files changed, 5 insertions, 70 deletions
diff --git a/crate_db/src/lib_crate_db.rs b/crate_db/src/lib_crate_db.rs index 725df95..857bbf5 100644 --- a/crate_db/src/lib_crate_db.rs +++ b/crate_db/src/lib_crate_db.rs @@ -52,7 +52,7 @@ pub struct CrateVersionData<'a> { pub authors: &'a [rich_crate::Author], pub category_slugs: &'a [Cow<'a, str>], pub repository: Option<&'a Repo>, - pub readme_text: Option<String>, + pub extracted_auto_keywords: Vec<(f32, String)>, } impl CrateDb { @@ -258,30 +258,9 @@ impl CrateDb { // add nonsense keywords if applied to freeform text insert_keyword.add_synonyms(&self.tag_synonyms); - { - let d = Self::extract_text_phrases(&c); - let mut sw = rake::StopWords::new(); - sw.reserve(STOPWORDS.len()); - sw.extend(STOPWORDS.iter().map(|s| s.to_string())); // TODO: use real stopwords, THEN filter via STOPWORDS again, because multiple Rust-y words are fine - // normalize space and _ to - - let r = rake::Rake::new(sw); - let rake_keywords = r.run_sentences(d.iter().map(|(_, s)| s.as_str())); - let rake_keywords = rake_keywords.iter() - .map(|k| ( - k.score.min(1.1), // - chop3words(k.keyword.as_str()) // rake generates very long setences sometimes - )); - // split on / and punctuation too - let keywords = d.iter().flat_map(|&(w2, ref d)| d.split_whitespace().map(move |s| (w2, s.trim_end_matches("'s")))) - .filter(|&(_, k)| k.len() >= 2) - .filter(|&(_, k)| STOPWORDS.get(k).is_none()); - - // replace ' ' with '-' - // keep if 3 words or less - for (i, (w2, k)) in rake_keywords.chain(keywords).take(25).enumerate() { - let w: f64 = w2 * 150. / (80 + i) as f64; - insert_keyword.add(&k, w, false); - } + for (i, (w2, k)) in c.extracted_auto_keywords.iter().enumerate() { + let w = *w2 as f64 * 150. / (80 + i) as f64; + insert_keyword.add(&k, w, false); } for feat in manifest.features.keys() { @@ -1056,37 +1035,6 @@ impl CrateDb { }).await } - // returns an array of lowercase phrases - fn extract_text_phrases(c: &CrateVersionData<'_>) -> Vec<(f64, String)> { - let mut out = Vec::new(); - let mut len = 0; - if let Some(s) = &c.manifest.package().description { - let s = s.to_lowercase(); - len += s.len(); - out.push((1., s)); - } - if let Some(s) = &c.derived.github_description { - let s = s.to_lowercase(); - len += s.len(); - out.push((1., s)); - } - if let Some(sub) = &c.readme_text { - // render readme to DOM, extract nodes - for par in sub.split('\n') { - if len > 200 { - break; - } - let par = par.trim_start_matches(|c: char| c.is_whitespace() || c == '#' || c == '=' || c == '*' || c == '-'); - let par = par.replace("http://", " ").replace("https://", " "); - if !par.is_empty() { - let par = par.to_lowercase(); - len += par.len(); - out.push((0.4, par)); - } - } - } - out - } } pub enum RepoChange { @@ -1179,19 +1127,6 @@ impl KeywordInsert { } } -fn chop3words(s: &str) -> &str { - let mut words = 0; - for (pos, ch) in s.char_indices() { - if ch == ' ' { - words += 1; - if words >= 3 { - return &s[0..pos]; - } - } - } - s -} - pub struct CrateOwnerRow { crate_id: u32, invited_by_github_id: Option<u32>, @@ -1236,7 +1171,7 @@ categories = ["1", "two", "GAMES", "science", "::science::math::"] authors: &[], category_slugs: &[], repository: None, - readme_text: None, + extracted_auto_keywords: Vec::new(), }).await.unwrap(); assert_eq!(1, db.crates_with_keyword("test-crate").await.unwrap()); let (new_manifest, new_derived) = db.rich_crate_version_data(&origin).await.unwrap(); |