From 87af4966975e6b70b176f9fe75877ba99b7a6264 Mon Sep 17 00:00:00 2001 From: Kornel Date: Mon, 2 Mar 2020 23:59:03 +0000 Subject: Extract keyword extraction --- Cargo.toml | 1 + crate_db/Cargo.toml | 1 - crate_db/src/lib_crate_db.rs | 75 ++----------------------- crate_db/src/stopwords.rs | 2 +- feat_extractor/Cargo.toml | 10 ++++ feat_extractor/src/lib.rs | 104 +++++++++++++++++++++++++++++++++++ kitchen_sink/Cargo.toml | 1 + kitchen_sink/src/lib_kitchen_sink.rs | 4 +- 8 files changed, 125 insertions(+), 73 deletions(-) create mode 100644 feat_extractor/Cargo.toml create mode 100644 feat_extractor/src/lib.rs diff --git a/Cargo.toml b/Cargo.toml index b4f79c9..d4c55a7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ members = [ "search_index", "ranking", "server", +"feat_extractor", "deps_index", "render_readme/dump", ] diff --git a/crate_db/Cargo.toml b/crate_db/Cargo.toml index 44d0de2..558301a 100644 --- a/crate_db/Cargo.toml +++ b/crate_db/Cargo.toml @@ -18,7 +18,6 @@ lazy_static = "1.4.0" chrono = "0.4.10" thread_local = "1.0.0" parking_lot = "0.10.0" -rake = { git = "https://github.com/kornelski/rake-rs" } rmp-serde = "0.14.0" heck = "0.3.1" semver = "0.9.0" diff --git a/crate_db/src/lib_crate_db.rs b/crate_db/src/lib_crate_db.rs index 725df95..857bbf5 100644 --- a/crate_db/src/lib_crate_db.rs +++ b/crate_db/src/lib_crate_db.rs @@ -52,7 +52,7 @@ pub struct CrateVersionData<'a> { pub authors: &'a [rich_crate::Author], pub category_slugs: &'a [Cow<'a, str>], pub repository: Option<&'a Repo>, - pub readme_text: Option, + pub extracted_auto_keywords: Vec<(f32, String)>, } impl CrateDb { @@ -258,30 +258,9 @@ impl CrateDb { // add nonsense keywords if applied to freeform text insert_keyword.add_synonyms(&self.tag_synonyms); - { - let d = Self::extract_text_phrases(&c); - let mut sw = rake::StopWords::new(); - sw.reserve(STOPWORDS.len()); - sw.extend(STOPWORDS.iter().map(|s| s.to_string())); // TODO: use real stopwords, THEN filter via STOPWORDS again, because multiple Rust-y words are fine - // normalize space and _ to - - let r = rake::Rake::new(sw); - let rake_keywords = r.run_sentences(d.iter().map(|(_, s)| s.as_str())); - let rake_keywords = rake_keywords.iter() - .map(|k| ( - k.score.min(1.1), // - chop3words(k.keyword.as_str()) // rake generates very long setences sometimes - )); - // split on / and punctuation too - let keywords = d.iter().flat_map(|&(w2, ref d)| d.split_whitespace().map(move |s| (w2, s.trim_end_matches("'s")))) - .filter(|&(_, k)| k.len() >= 2) - .filter(|&(_, k)| STOPWORDS.get(k).is_none()); - - // replace ' ' with '-' - // keep if 3 words or less - for (i, (w2, k)) in rake_keywords.chain(keywords).take(25).enumerate() { - let w: f64 = w2 * 150. / (80 + i) as f64; - insert_keyword.add(&k, w, false); - } + for (i, (w2, k)) in c.extracted_auto_keywords.iter().enumerate() { + let w = *w2 as f64 * 150. / (80 + i) as f64; + insert_keyword.add(&k, w, false); } for feat in manifest.features.keys() { @@ -1056,37 +1035,6 @@ impl CrateDb { }).await } - // returns an array of lowercase phrases - fn extract_text_phrases(c: &CrateVersionData<'_>) -> Vec<(f64, String)> { - let mut out = Vec::new(); - let mut len = 0; - if let Some(s) = &c.manifest.package().description { - let s = s.to_lowercase(); - len += s.len(); - out.push((1., s)); - } - if let Some(s) = &c.derived.github_description { - let s = s.to_lowercase(); - len += s.len(); - out.push((1., s)); - } - if let Some(sub) = &c.readme_text { - // render readme to DOM, extract nodes - for par in sub.split('\n') { - if len > 200 { - break; - } - let par = par.trim_start_matches(|c: char| c.is_whitespace() || c == '#' || c == '=' || c == '*' || c == '-'); - let par = par.replace("http://", " ").replace("https://", " "); - if !par.is_empty() { - let par = par.to_lowercase(); - len += par.len(); - out.push((0.4, par)); - } - } - } - out - } } pub enum RepoChange { @@ -1179,19 +1127,6 @@ impl KeywordInsert { } } -fn chop3words(s: &str) -> &str { - let mut words = 0; - for (pos, ch) in s.char_indices() { - if ch == ' ' { - words += 1; - if words >= 3 { - return &s[0..pos]; - } - } - } - s -} - pub struct CrateOwnerRow { crate_id: u32, invited_by_github_id: Option, @@ -1236,7 +1171,7 @@ categories = ["1", "two", "GAMES", "science", "::science::math::"] authors: &[], category_slugs: &[], repository: None, - readme_text: None, + extracted_auto_keywords: Vec::new(), }).await.unwrap(); assert_eq!(1, db.crates_with_keyword("test-crate").await.unwrap()); let (new_manifest, new_derived) = db.rich_crate_version_data(&origin).await.unwrap(); diff --git a/crate_db/src/stopwords.rs b/crate_db/src/stopwords.rs index 8eb0099..161e951 100644 --- a/crate_db/src/stopwords.rs +++ b/crate_db/src/stopwords.rs @@ -25,7 +25,7 @@ lazy_static! { "some", "specific", "still", "stuff", "such", "take", "than", "that", "the", "their", "them", "then", "there", "therefore", "these", "they", "things", "this", "those", "to", "todo", "too", "travis", "two", "under", "us", - "usable", "use", "used", "useful", "using", "v1", "v2", "v3", "v4", "various", + "usable", "use", "used", "useful", "using", "usage", "v1", "v2", "v3", "v4", "various", "very", "via", "want", "way", "well", "we'll", "what", "when", "where", "which", "while", "will", "wip", "with", "without", "working", "works", "writing", "written", "yet", "you", "your", "build status", "meritbadge", "common", diff --git a/feat_extractor/Cargo.toml b/feat_extractor/Cargo.toml new file mode 100644 index 0000000..9a6ece8 --- /dev/null +++ b/feat_extractor/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "feat_extractor" +version = "0.1.0" +authors = ["Kornel "] +edition = "2018" + +[dependencies] +rake = { git = "https://github.com/kornelski/rake-rs" } +rich_crate = { path = "../rich_crate" } +lazy_static = "1.4.0" diff --git a/feat_extractor/src/lib.rs b/feat_extractor/src/lib.rs new file mode 100644 index 0000000..fae0c4a --- /dev/null +++ b/feat_extractor/src/lib.rs @@ -0,0 +1,104 @@ +use std::collections::HashSet; +use rich_crate::ManifestExt; +use rich_crate::Manifest; + + +lazy_static::lazy_static! { + /// ignore these as keywords + pub(crate) static ref STOPWORDS: HashSet<&'static str> = [ + "a", "sys", "ffi", "placeholder", "app", "loops", "master", "library", "rs", + "accidentally", "additional", "adds", "against", "all", "allow", "allows", + "already", "also", "alternative", "always", "an", "and", "any", "appropriate", + "arbitrary", "are", "as", "at", "available", "based", "be", "because", "been", + "both", "but", "by", "can", "certain", "changes", "comes", "contains", "core", "cost", + "crate", "crates.io", "current", "currently", "custom", "dependencies", + "dependency", "developers", "do", "don't", "e.g", "easily", "easy", "either", + "enables", "etc", "even", "every", "example", "examples", "features", "feel", + "files", "for", "from", "fully", "function", "get", "given", "had", "has", + "have", "here", "if", "implementing", "implements", "in", "includes", + "including", "incurring", "installation", "interested", "into", "is", "it", + "it's", "its", "itself", "just", "known", "large", "later", "library", + "license", "lightweight", "like", "made", "main", "make", "makes", "many", + "may", "me", "means", "method", "minimal", "mit", "more", "mostly", "much", + "need", "needed", "never", "new", "no", "noop", "not", "of", "on", "one", + "only", "or", "other", "over", "plausible", "please", "possible", "program", + "provides", "put", "readme", "release", "runs", "rust", "rust's", "same", + "see", "selected", "should", "similar", "simple", "simply", "since", "small", "so", + "some", "specific", "still", "stuff", "such", "take", "than", "that", "the", + "their", "them", "then", "there", "therefore", "these", "they", "things", + "this", "those", "to", "todo", "too", "travis", "two", "under", "us", + "usable", "use", "used", "useful", "using", "usage", "v1", "v2", "v3", "v4", "various", + "very", "via", "want", "way", "well", "we'll", "what", "when", "where", "which", + "while", "will", "wip", "with", "without", "working", "works", "writing", + "written", "yet", "you", "your", "build status", "meritbadge", "common", + "file was generated", "easy to use", + ].iter().copied().collect(); +} + + // returns an array of lowercase phrases + fn extract_text_phrases(manifest: &Manifest, github_description: Option<&str>, readme_text: Option<&str>) -> Vec<(f64, String)> { + let mut out = Vec::new(); + let mut len = 0; + if let Some(s) = &manifest.package().description { + let s = s.to_lowercase(); + len += s.len(); + out.push((1., s)); + } + if let Some(s) = github_description { + let s = s.to_lowercase(); + len += s.len(); + out.push((1., s)); + } + if let Some(sub) = &readme_text { + // render readme to DOM, extract nodes + for par in sub.split('\n') { + if len > 200 { + break; + } + let par = par.trim_start_matches(|c: char| c.is_whitespace() || c == '#' || c == '=' || c == '*' || c == '-'); + let par = par.replace("http://", " ").replace("https://", " "); + if !par.is_empty() { + let par = par.to_lowercase(); + len += par.len(); + out.push((0.4, par)); + } + } + } + out + } + + pub fn auto_keywords(manifest: &Manifest, github_description: Option<&str>, readme_text: Option<&str>) -> Vec<(f32, String)> { + let d = extract_text_phrases(manifest, github_description, readme_text); + let mut sw = rake::StopWords::new(); + sw.reserve(STOPWORDS.len()); + sw.extend(STOPWORDS.iter().map(|s| s.to_string())); // TODO: use real stopwords, THEN filter via STOPWORDS again, because multiple Rust-y words are fine + // normalize space and _ to - + let r = rake::Rake::new(sw); + let rake_keywords = r.run_sentences(d.iter().map(|(_, s)| s.as_str())); + let rake_keywords = rake_keywords.iter() + .map(|k| ( + k.score.min(1.1), // + chop3words(k.keyword.as_str()) // rake generates very long setences sometimes + )); + // split on / and punctuation too + let keywords = d.iter().flat_map(|&(w2, ref d)| d.split_whitespace().map(move |s| (w2, s.trim_end_matches("'s")))) + .filter(|&(_, k)| k.len() >= 2) + .filter(|&(_, k)| STOPWORDS.get(k).is_none()); + + // replace ' ' with '-' + // keep if 3 words or less + rake_keywords.chain(keywords).take(25).map(|(w, s)| (w as f32, s.to_owned())).collect() + } + +fn chop3words(s: &str) -> &str { + let mut words = 0; + for (pos, ch) in s.char_indices() { + if ch == ' ' { + words += 1; + if words >= 3 { + return &s[0..pos]; + } + } + } + s +} diff --git a/kitchen_sink/Cargo.toml b/kitchen_sink/Cargo.toml index 6ac6abf..74987f9 100644 --- a/kitchen_sink/Cargo.toml +++ b/kitchen_sink/Cargo.toml @@ -12,6 +12,7 @@ path = "src/lib_kitchen_sink.rs" [dependencies] crates_io_client = { path = "../crates_io_client" } deps_index = { path = "../deps_index" } +feat_extractor = { path = "../feat_extractor" } docs_rs_client = { path = "../docs_rs_client", version = "0.4.0" } github_info = { path = "../github_info", version = "0.9" } crate_git_checkout = { path = "../crate_git_checkout", version = "0.4.3" } diff --git a/kitchen_sink/src/lib_kitchen_sink.rs b/kitchen_sink/src/lib_kitchen_sink.rs index 3e6bb16..ae33cfa 100644 --- a/kitchen_sink/src/lib_kitchen_sink.rs +++ b/kitchen_sink/src/lib_kitchen_sink.rs @@ -1369,8 +1369,9 @@ impl KitchenSink { &tmp }; + let extracted_auto_keywords = feat_extractor::auto_keywords(&manifest, src.github_description.as_deref(), readme_text.as_deref()); + self.crate_db.index_latest(CrateVersionData { - readme_text, category_slugs, authors: &authors, origin, @@ -1379,6 +1380,7 @@ impl KitchenSink { is_build, is_dev, manifest: &manifest, derived: &src, + extracted_auto_keywords, }).await?; Ok(()) } -- cgit v1.2.3