diff options
author | Kornel <kornel@geekhood.net> | 2020-03-02 23:59:03 +0000 |
---|---|---|
committer | Kornel <kornel@geekhood.net> | 2020-03-03 19:24:32 +0000 |
commit | 87af4966975e6b70b176f9fe75877ba99b7a6264 (patch) | |
tree | 47c622b98d9533e0d1891ed5ebe044a1b4e4045b /crate_db | |
parent | 8d792a73a64b92b298872c1726013b19205bab53 (diff) |
Extract keyword extraction
Diffstat (limited to 'crate_db')
-rw-r--r-- | crate_db/Cargo.toml | 1 | ||||
-rw-r--r-- | crate_db/src/lib_crate_db.rs | 75 | ||||
-rw-r--r-- | crate_db/src/stopwords.rs | 2 |
3 files changed, 6 insertions, 72 deletions
diff --git a/crate_db/Cargo.toml b/crate_db/Cargo.toml index 44d0de2..558301a 100644 --- a/crate_db/Cargo.toml +++ b/crate_db/Cargo.toml @@ -18,7 +18,6 @@ lazy_static = "1.4.0" chrono = "0.4.10" thread_local = "1.0.0" parking_lot = "0.10.0" -rake = { git = "https://github.com/kornelski/rake-rs" } rmp-serde = "0.14.0" heck = "0.3.1" semver = "0.9.0" diff --git a/crate_db/src/lib_crate_db.rs b/crate_db/src/lib_crate_db.rs index 725df95..857bbf5 100644 --- a/crate_db/src/lib_crate_db.rs +++ b/crate_db/src/lib_crate_db.rs @@ -52,7 +52,7 @@ pub struct CrateVersionData<'a> { pub authors: &'a [rich_crate::Author], pub category_slugs: &'a [Cow<'a, str>], pub repository: Option<&'a Repo>, - pub readme_text: Option<String>, + pub extracted_auto_keywords: Vec<(f32, String)>, } impl CrateDb { @@ -258,30 +258,9 @@ impl CrateDb { // add nonsense keywords if applied to freeform text insert_keyword.add_synonyms(&self.tag_synonyms); - { - let d = Self::extract_text_phrases(&c); - let mut sw = rake::StopWords::new(); - sw.reserve(STOPWORDS.len()); - sw.extend(STOPWORDS.iter().map(|s| s.to_string())); // TODO: use real stopwords, THEN filter via STOPWORDS again, because multiple Rust-y words are fine - // normalize space and _ to - - let r = rake::Rake::new(sw); - let rake_keywords = r.run_sentences(d.iter().map(|(_, s)| s.as_str())); - let rake_keywords = rake_keywords.iter() - .map(|k| ( - k.score.min(1.1), // - chop3words(k.keyword.as_str()) // rake generates very long setences sometimes - )); - // split on / and punctuation too - let keywords = d.iter().flat_map(|&(w2, ref d)| d.split_whitespace().map(move |s| (w2, s.trim_end_matches("'s")))) - .filter(|&(_, k)| k.len() >= 2) - .filter(|&(_, k)| STOPWORDS.get(k).is_none()); - - // replace ' ' with '-' - // keep if 3 words or less - for (i, (w2, k)) in rake_keywords.chain(keywords).take(25).enumerate() { - let w: f64 = w2 * 150. / (80 + i) as f64; - insert_keyword.add(&k, w, false); - } + for (i, (w2, k)) in c.extracted_auto_keywords.iter().enumerate() { + let w = *w2 as f64 * 150. / (80 + i) as f64; + insert_keyword.add(&k, w, false); } for feat in manifest.features.keys() { @@ -1056,37 +1035,6 @@ impl CrateDb { }).await } - // returns an array of lowercase phrases - fn extract_text_phrases(c: &CrateVersionData<'_>) -> Vec<(f64, String)> { - let mut out = Vec::new(); - let mut len = 0; - if let Some(s) = &c.manifest.package().description { - let s = s.to_lowercase(); - len += s.len(); - out.push((1., s)); - } - if let Some(s) = &c.derived.github_description { - let s = s.to_lowercase(); - len += s.len(); - out.push((1., s)); - } - if let Some(sub) = &c.readme_text { - // render readme to DOM, extract nodes - for par in sub.split('\n') { - if len > 200 { - break; - } - let par = par.trim_start_matches(|c: char| c.is_whitespace() || c == '#' || c == '=' || c == '*' || c == '-'); - let par = par.replace("http://", " ").replace("https://", " "); - if !par.is_empty() { - let par = par.to_lowercase(); - len += par.len(); - out.push((0.4, par)); - } - } - } - out - } } pub enum RepoChange { @@ -1179,19 +1127,6 @@ impl KeywordInsert { } } -fn chop3words(s: &str) -> &str { - let mut words = 0; - for (pos, ch) in s.char_indices() { - if ch == ' ' { - words += 1; - if words >= 3 { - return &s[0..pos]; - } - } - } - s -} - pub struct CrateOwnerRow { crate_id: u32, invited_by_github_id: Option<u32>, @@ -1236,7 +1171,7 @@ categories = ["1", "two", "GAMES", "science", "::science::math::"] authors: &[], category_slugs: &[], repository: None, - readme_text: None, + extracted_auto_keywords: Vec::new(), }).await.unwrap(); assert_eq!(1, db.crates_with_keyword("test-crate").await.unwrap()); let (new_manifest, new_derived) = db.rich_crate_version_data(&origin).await.unwrap(); diff --git a/crate_db/src/stopwords.rs b/crate_db/src/stopwords.rs index 8eb0099..161e951 100644 --- a/crate_db/src/stopwords.rs +++ b/crate_db/src/stopwords.rs @@ -25,7 +25,7 @@ lazy_static! { "some", "specific", "still", "stuff", "such", "take", "than", "that", "the", "their", "them", "then", "there", "therefore", "these", "they", "things", "this", "those", "to", "todo", "too", "travis", "two", "under", "us", - "usable", "use", "used", "useful", "using", "v1", "v2", "v3", "v4", "various", + "usable", "use", "used", "useful", "using", "usage", "v1", "v2", "v3", "v4", "various", "very", "via", "want", "way", "well", "we'll", "what", "when", "where", "which", "while", "will", "wip", "with", "without", "working", "works", "writing", "written", "yet", "you", "your", "build status", "meritbadge", "common", |