summary refs log tree commit diff stats
path: root/crate_db
diff options
context:
space:
mode:
author Kornel <kornel@geekhood.net> 2020-03-02 23:59:03 +0000
committer Kornel <kornel@geekhood.net> 2020-03-03 19:24:32 +0000
commit 87af4966975e6b70b176f9fe75877ba99b7a6264 (patch)
tree 47c622b98d9533e0d1891ed5ebe044a1b4e4045b /crate_db
parent 8d792a73a64b92b298872c1726013b19205bab53 (diff)
Extract keyword extraction
Diffstat (limited to 'crate_db')
-rw-r--r-- crate_db/Cargo.toml 1
-rw-r--r-- crate_db/src/lib_crate_db.rs 75
-rw-r--r-- crate_db/src/stopwords.rs 2
3 files changed, 6 insertions, 72 deletions
diff --git a/crate_db/Cargo.toml b/crate_db/Cargo.toml
index 44d0de2..558301a 100644
--- a/crate_db/Cargo.toml
+++ b/crate_db/Cargo.toml
@@ -18,7 +18,6 @@ lazy_static = "1.4.0"
chrono = "0.4.10"
thread_local = "1.0.0"
parking_lot = "0.10.0"
-rake = { git = "https://github.com/kornelski/rake-rs" }
rmp-serde = "0.14.0"
heck = "0.3.1"
semver = "0.9.0"
diff --git a/crate_db/src/lib_crate_db.rs b/crate_db/src/lib_crate_db.rs
index 725df95..857bbf5 100644
--- a/crate_db/src/lib_crate_db.rs
+++ b/crate_db/src/lib_crate_db.rs
@@ -52,7 +52,7 @@ pub struct CrateVersionData<'a> {
pub authors: &'a [rich_crate::Author],
pub category_slugs: &'a [Cow<'a, str>],
pub repository: Option<&'a Repo>,
- pub readme_text: Option<String>,
+ pub extracted_auto_keywords: Vec<(f32, String)>,
}
impl CrateDb {
@@ -258,30 +258,9 @@ impl CrateDb {
// add nonsense keywords if applied to freeform text
insert_keyword.add_synonyms(&self.tag_synonyms);
- {
- let d = Self::extract_text_phrases(&c);
- let mut sw = rake::StopWords::new();
- sw.reserve(STOPWORDS.len());
- sw.extend(STOPWORDS.iter().map(|s| s.to_string())); // TODO: use real stopwords, THEN filter via STOPWORDS again, because multiple Rust-y words are fine
- // normalize space and _ to -
- let r = rake::Rake::new(sw);
- let rake_keywords = r.run_sentences(d.iter().map(|(_, s)| s.as_str()));
- let rake_keywords = rake_keywords.iter()
- .map(|k| (
- k.score.min(1.1), //
- chop3words(k.keyword.as_str()) // rake generates very long setences sometimes
- ));
- // split on / and punctuation too
- let keywords = d.iter().flat_map(|&(w2, ref d)| d.split_whitespace().map(move |s| (w2, s.trim_end_matches("'s"))))
- .filter(|&(_, k)| k.len() >= 2)
- .filter(|&(_, k)| STOPWORDS.get(k).is_none());
-
- // replace ' ' with '-'
- // keep if 3 words or less
- for (i, (w2, k)) in rake_keywords.chain(keywords).take(25).enumerate() {
- let w: f64 = w2 * 150. / (80 + i) as f64;
- insert_keyword.add(&k, w, false);
- }
+ for (i, (w2, k)) in c.extracted_auto_keywords.iter().enumerate() {
+ let w = *w2 as f64 * 150. / (80 + i) as f64;
+ insert_keyword.add(&k, w, false);
}
for feat in manifest.features.keys() {
@@ -1056,37 +1035,6 @@ impl CrateDb {
}).await
}
- // returns an array of lowercase phrases
- fn extract_text_phrases(c: &CrateVersionData<'_>) -> Vec<(f64, String)> {
- let mut out = Vec::new();
- let mut len = 0;
- if let Some(s) = &c.manifest.package().description {
- let s = s.to_lowercase();
- len += s.len();
- out.push((1., s));
- }
- if let Some(s) = &c.derived.github_description {
- let s = s.to_lowercase();
- len += s.len();
- out.push((1., s));
- }
- if let Some(sub) = &c.readme_text {
- // render readme to DOM, extract nodes
- for par in sub.split('\n') {
- if len > 200 {
- break;
- }
- let par = par.trim_start_matches(|c: char| c.is_whitespace() || c == '#' || c == '=' || c == '*' || c == '-');
- let par = par.replace("http://", " ").replace("https://", " ");
- if !par.is_empty() {
- let par = par.to_lowercase();
- len += par.len();
- out.push((0.4, par));
- }
- }
- }
- out
- }
}
pub enum RepoChange {
@@ -1179,19 +1127,6 @@ impl KeywordInsert {
}
}
-fn chop3words(s: &str) -> &str {
- let mut words = 0;
- for (pos, ch) in s.char_indices() {
- if ch == ' ' {
- words += 1;
- if words >= 3 {
- return &s[0..pos];
- }
- }
- }
- s
-}
-
pub struct CrateOwnerRow {
crate_id: u32,
invited_by_github_id: Option<u32>,
@@ -1236,7 +1171,7 @@ categories = ["1", "two", "GAMES", "science", "::science::math::"]
authors: &[],
category_slugs: &[],
repository: None,
- readme_text: None,
+ extracted_auto_keywords: Vec::new(),
}).await.unwrap();
assert_eq!(1, db.crates_with_keyword("test-crate").await.unwrap());
let (new_manifest, new_derived) = db.rich_crate_version_data(&origin).await.unwrap();
diff --git a/crate_db/src/stopwords.rs b/crate_db/src/stopwords.rs
index 8eb0099..161e951 100644
--- a/crate_db/src/stopwords.rs
+++ b/crate_db/src/stopwords.rs
@@ -25,7 +25,7 @@ lazy_static! {
"some", "specific", "still", "stuff", "such", "take", "than", "that", "the",
"their", "them", "then", "there", "therefore", "these", "they", "things",
"this", "those", "to", "todo", "too", "travis", "two", "under", "us",
- "usable", "use", "used", "useful", "using", "v1", "v2", "v3", "v4", "various",
+ "usable", "use", "used", "useful", "using", "usage", "v1", "v2", "v3", "v4", "various",
"very", "via", "want", "way", "well", "we'll", "what", "when", "where", "which",
"while", "will", "wip", "with", "without", "working", "works", "writing",
"written", "yet", "you", "your", "build status", "meritbadge", "common",