summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKornel <kornel@geekhood.net>2020-03-02 23:59:03 +0000
committerKornel <kornel@geekhood.net>2020-03-03 19:24:32 +0000
commit87af4966975e6b70b176f9fe75877ba99b7a6264 (patch)
tree47c622b98d9533e0d1891ed5ebe044a1b4e4045b
parent8d792a73a64b92b298872c1726013b19205bab53 (diff)
Extract keyword extraction
-rw-r--r--Cargo.toml1
-rw-r--r--crate_db/Cargo.toml1
-rw-r--r--crate_db/src/lib_crate_db.rs75
-rw-r--r--crate_db/src/stopwords.rs2
-rw-r--r--feat_extractor/Cargo.toml10
-rw-r--r--feat_extractor/src/lib.rs104
-rw-r--r--kitchen_sink/Cargo.toml1
-rw-r--r--kitchen_sink/src/lib_kitchen_sink.rs4
8 files changed, 125 insertions, 73 deletions
diff --git a/Cargo.toml b/Cargo.toml
index b4f79c9..d4c55a7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -23,6 +23,7 @@ members = [
"search_index",
"ranking",
"server",
+"feat_extractor",
"deps_index",
"render_readme/dump",
]
diff --git a/crate_db/Cargo.toml b/crate_db/Cargo.toml
index 44d0de2..558301a 100644
--- a/crate_db/Cargo.toml
+++ b/crate_db/Cargo.toml
@@ -18,7 +18,6 @@ lazy_static = "1.4.0"
chrono = "0.4.10"
thread_local = "1.0.0"
parking_lot = "0.10.0"
-rake = { git = "https://github.com/kornelski/rake-rs" }
rmp-serde = "0.14.0"
heck = "0.3.1"
semver = "0.9.0"
diff --git a/crate_db/src/lib_crate_db.rs b/crate_db/src/lib_crate_db.rs
index 725df95..857bbf5 100644
--- a/crate_db/src/lib_crate_db.rs
+++ b/crate_db/src/lib_crate_db.rs
@@ -52,7 +52,7 @@ pub struct CrateVersionData<'a> {
pub authors: &'a [rich_crate::Author],
pub category_slugs: &'a [Cow<'a, str>],
pub repository: Option<&'a Repo>,
- pub readme_text: Option<String>,
+ pub extracted_auto_keywords: Vec<(f32, String)>,
}
impl CrateDb {
@@ -258,30 +258,9 @@ impl CrateDb {
// add nonsense keywords if applied to freeform text
insert_keyword.add_synonyms(&self.tag_synonyms);
- {
- let d = Self::extract_text_phrases(&c);
- let mut sw = rake::StopWords::new();
- sw.reserve(STOPWORDS.len());
- sw.extend(STOPWORDS.iter().map(|s| s.to_string())); // TODO: use real stopwords, THEN filter via STOPWORDS again, because multiple Rust-y words are fine
- // normalize space and _ to -
- let r = rake::Rake::new(sw);
- let rake_keywords = r.run_sentences(d.iter().map(|(_, s)| s.as_str()));
- let rake_keywords = rake_keywords.iter()
- .map(|k| (
- k.score.min(1.1), //
- chop3words(k.keyword.as_str()) // rake generates very long setences sometimes
- ));
- // split on / and punctuation too
- let keywords = d.iter().flat_map(|&(w2, ref d)| d.split_whitespace().map(move |s| (w2, s.trim_end_matches("'s"))))
- .filter(|&(_, k)| k.len() >= 2)
- .filter(|&(_, k)| STOPWORDS.get(k).is_none());
-
- // replace ' ' with '-'
- // keep if 3 words or less
- for (i, (w2, k)) in rake_keywords.chain(keywords).take(25).enumerate() {
- let w: f64 = w2 * 150. / (80 + i) as f64;
- insert_keyword.add(&k, w, false);
- }
+ for (i, (w2, k)) in c.extracted_auto_keywords.iter().enumerate() {
+ let w = *w2 as f64 * 150. / (80 + i) as f64;
+ insert_keyword.add(&k, w, false);
}
for feat in manifest.features.keys() {
@@ -1056,37 +1035,6 @@ impl CrateDb {
}).await
}
- // returns an array of lowercase phrases
- fn extract_text_phrases(c: &CrateVersionData<'_>) -> Vec<(f64, String)> {
- let mut out = Vec::new();
- let mut len = 0;
- if let Some(s) = &c.manifest.package().description {
- let s = s.to_lowercase();
- len += s.len();
- out.push((1., s));
- }
- if let Some(s) = &c.derived.github_description {
- let s = s.to_lowercase();
- len += s.len();
- out.push((1., s));
- }
- if let Some(sub) = &c.readme_text {
- // render readme to DOM, extract nodes
- for par in sub.split('\n') {
- if len > 200 {
- break;
- }
- let par = par.trim_start_matches(|c: char| c.is_whitespace() || c == '#' || c == '=' || c == '*' || c == '-');
- let par = par.replace("http://", " ").replace("https://", " ");
- if !par.is_empty() {
- let par = par.to_lowercase();
- len += par.len();
- out.push((0.4, par));
- }
- }
- }
- out
- }
}
pub enum RepoChange {
@@ -1179,19 +1127,6 @@ impl KeywordInsert {
}
}
-fn chop3words(s: &str) -> &str {
- let mut words = 0;
- for (pos, ch) in s.char_indices() {
- if ch == ' ' {
- words += 1;
- if words >= 3 {
- return &s[0..pos];
- }
- }
- }
- s
-}
-
pub struct CrateOwnerRow {
crate_id: u32,
invited_by_github_id: Option<u32>,
@@ -1236,7 +1171,7 @@ categories = ["1", "two", "GAMES", "science", "::science::math::"]
authors: &[],
category_slugs: &[],
repository: None,
- readme_text: None,
+ extracted_auto_keywords: Vec::new(),
}).await.unwrap();
assert_eq!(1, db.crates_with_keyword("test-crate").await.unwrap());
let (new_manifest, new_derived) = db.rich_crate_version_data(&origin).await.unwrap();
diff --git a/crate_db/src/stopwords.rs b/crate_db/src/stopwords.rs
index 8eb0099..161e951 100644
--- a/crate_db/src/stopwords.rs
+++ b/crate_db/src/stopwords.rs
@@ -25,7 +25,7 @@ lazy_static! {
"some", "specific", "still", "stuff", "such", "take", "than", "that", "the",
"their", "them", "then", "there", "therefore", "these", "they", "things",
"this", "those", "to", "todo", "too", "travis", "two", "under", "us",
- "usable", "use", "used", "useful", "using", "v1", "v2", "v3", "v4", "various",
+ "usable", "use", "used", "useful", "using", "usage", "v1", "v2", "v3", "v4", "various",
"very", "via", "want", "way", "well", "we'll", "what", "when", "where", "which",
"while", "will", "wip", "with", "without", "working", "works", "writing",
"written", "yet", "you", "your", "build status", "meritbadge", "common",
diff --git a/feat_extractor/Cargo.toml b/feat_extractor/Cargo.toml
new file mode 100644
index 0000000..9a6ece8
--- /dev/null
+++ b/feat_extractor/Cargo.toml
@@ -0,0 +1,10 @@
+[package]
+name = "feat_extractor"
+version = "0.1.0"
+authors = ["Kornel <kornel@geekhood.net>"]
+edition = "2018"
+
+[dependencies]
+rake = { git = "https://github.com/kornelski/rake-rs" }
+rich_crate = { path = "../rich_crate" }
+lazy_static = "1.4.0"
diff --git a/feat_extractor/src/lib.rs b/feat_extractor/src/lib.rs
new file mode 100644
index 0000000..fae0c4a
--- /dev/null
+++ b/feat_extractor/src/lib.rs
@@ -0,0 +1,104 @@
+use std::collections::HashSet;
+use rich_crate::ManifestExt;
+use rich_crate::Manifest;
+
+
+lazy_static::lazy_static! {
+ /// Words and phrases never accepted as automatic keywords; `auto_keywords` also copies them into RAKE's stop-word list.
+ pub(crate) static ref STOPWORDS: HashSet<&'static str> = [
+ "a", "sys", "ffi", "placeholder", "app", "loops", "master", "library", "rs",
+ "accidentally", "additional", "adds", "against", "all", "allow", "allows",
+ "already", "also", "alternative", "always", "an", "and", "any", "appropriate",
+ "arbitrary", "are", "as", "at", "available", "based", "be", "because", "been",
+ "both", "but", "by", "can", "certain", "changes", "comes", "contains", "core", "cost",
+ "crate", "crates.io", "current", "currently", "custom", "dependencies",
+ "dependency", "developers", "do", "don't", "e.g", "easily", "easy", "either",
+ "enables", "etc", "even", "every", "example", "examples", "features", "feel",
+ "files", "for", "from", "fully", "function", "get", "given", "had", "has",
+ "have", "here", "if", "implementing", "implements", "in", "includes",
+ "including", "incurring", "installation", "interested", "into", "is", "it",
+ "it's", "its", "itself", "just", "known", "large", "later", "library",
+ "license", "lightweight", "like", "made", "main", "make", "makes", "many",
+ "may", "me", "means", "method", "minimal", "mit", "more", "mostly", "much",
+ "need", "needed", "never", "new", "no", "noop", "not", "of", "on", "one",
+ "only", "or", "other", "over", "plausible", "please", "possible", "program",
+ "provides", "put", "readme", "release", "runs", "rust", "rust's", "same",
+ "see", "selected", "should", "similar", "simple", "simply", "since", "small", "so",
+ "some", "specific", "still", "stuff", "such", "take", "than", "that", "the",
+ "their", "them", "then", "there", "therefore", "these", "they", "things",
+ "this", "those", "to", "todo", "too", "travis", "two", "under", "us",
+ "usable", "use", "used", "useful", "using", "usage", "v1", "v2", "v3", "v4", "various",
+ "very", "via", "want", "way", "well", "we'll", "what", "when", "where", "which",
+ "while", "will", "wip", "with", "without", "working", "works", "writing",
+ "written", "yet", "you", "your", "build status", "meritbadge", "common",
+ "file was generated", "easy to use",
+ ].iter().copied().collect();
+}
+
+ // Returns lowercased (weight, phrase) pairs: the manifest and GitHub descriptions at weight 1.0, then non-empty README lines at weight 0.4, stopping once more than 200 bytes of text have been collected in total.
+ fn extract_text_phrases(manifest: &Manifest, github_description: Option<&str>, readme_text: Option<&str>) -> Vec<(f64, String)> {
+ let mut out = Vec::new();
+ let mut len = 0;
+ if let Some(s) = &manifest.package().description {
+ let s = s.to_lowercase();
+ len += s.len();
+ out.push((1., s));
+ }
+ if let Some(s) = github_description {
+ let s = s.to_lowercase();
+ len += s.len();
+ out.push((1., s));
+ }
+ if let Some(sub) = &readme_text {
+ // TODO: render readme to DOM, extract nodes. For now each line is used as-is after dropping leading whitespace/'#'/'='/'*'/'-' and blanking "http://" and "https://".
+ for par in sub.split('\n') {
+ if len > 200 {
+ break;
+ }
+ let par = par.trim_start_matches(|c: char| c.is_whitespace() || c == '#' || c == '=' || c == '*' || c == '-');
+ let par = par.replace("http://", " ").replace("https://", " ");
+ if !par.is_empty() {
+ let par = par.to_lowercase();
+ len += par.len();
+ out.push((0.4, par));
+ }
+ }
+ }
+ out
+ }
+ /// Returns at most 25 `(weight, keyword)` candidates from the descriptions and README text: RAKE phrases first (score capped at 1.1, cut to three words), then single words of 2+ bytes that are not in `STOPWORDS`.
+ pub fn auto_keywords(manifest: &Manifest, github_description: Option<&str>, readme_text: Option<&str>) -> Vec<(f32, String)> {
+ let d = extract_text_phrases(manifest, github_description, readme_text);
+ let mut sw = rake::StopWords::new();
+ sw.reserve(STOPWORDS.len());
+ sw.extend(STOPWORDS.iter().map(|s| s.to_string())); // TODO: use real stopwords, THEN filter via STOPWORDS again, because multiple Rust-y words are fine
+ // TODO: normalize space and _ to - (not done here yet)
+ let r = rake::Rake::new(sw);
+ let rake_keywords = r.run_sentences(d.iter().map(|(_, s)| s.as_str()));
+ let rake_keywords = rake_keywords.iter()
+ .map(|k| (
+ k.score.min(1.1), // cap the RAKE score at 1.1
+ chop3words(k.keyword.as_str()) // rake generates very long sentences sometimes
+ ));
+ // TODO: split on / and punctuation too; currently words are split on whitespace only and trailing "'s" suffixes are trimmed
+ let keywords = d.iter().flat_map(|&(w2, ref d)| d.split_whitespace().map(move |s| (w2, s.trim_end_matches("'s"))))
+ .filter(|&(_, k)| k.len() >= 2)
+ .filter(|&(_, k)| STOPWORDS.get(k).is_none());
+
+ // TODO: replace ' ' with '-'
+ // keep if 3 words or less (RAKE phrases were already cut by chop3words); RAKE results come first, only 25 entries are kept, and weights are narrowed to f32
+ rake_keywords.chain(keywords).take(25).map(|(w, s)| (w as f32, s.to_owned())).collect()
+ }
+/// Returns the part of `s` before its third space character (at most three space-separated words), or all of `s` when it contains fewer than three spaces.
+fn chop3words(s: &str) -> &str {
+ let mut words = 0;
+ for (pos, ch) in s.char_indices() {
+ if ch == ' ' {
+ words += 1;
+ if words >= 3 {
+ return &s[0..pos];
+ }
+ }
+ }
+ s
+}
diff --git a/kitchen_sink/Cargo.toml b/kitchen_sink/Cargo.toml
index 6ac6abf..74987f9 100644
--- a/kitchen_sink/Cargo.toml
+++ b/kitchen_sink/Cargo.toml
@@ -12,6 +12,7 @@ path = "src/lib_kitchen_sink.rs"
[dependencies]
crates_io_client = { path = "../crates_io_client" }
deps_index = { path = "../deps_index" }
+feat_extractor = { path = "../feat_extractor" }
docs_rs_client = { path = "../docs_rs_client", version = "0.4.0" }
github_info = { path = "../github_info", version = "0.9" }
crate_git_checkout = { path = "../crate_git_checkout", version = "0.4.3" }
diff --git a/kitchen_sink/src/lib_kitchen_sink.rs b/kitchen_sink/src/lib_kitchen_sink.rs
index 3e6bb16..ae33cfa 100644
--- a/kitchen_sink/src/lib_kitchen_sink.rs
+++ b/kitchen_sink/src/lib_kitchen_sink.rs
@@ -1369,8 +1369,9 @@ impl KitchenSink {
&tmp
};
+ let extracted_auto_keywords = feat_extractor::auto_keywords(&manifest, src.github_description.as_deref(), readme_text.as_deref());
+
self.crate_db.index_latest(CrateVersionData {
- readme_text,
category_slugs,
authors: &authors,
origin,
@@ -1379,6 +1380,7 @@ impl KitchenSink {
is_build, is_dev,
manifest: &manifest,
derived: &src,
+ extracted_auto_keywords,
}).await?;
Ok(())
}