summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Alejandro Gallo <aamsgallo@gmail.com> 2019-01-22 23:38:45 +0100
committer Alejandro Gallo <aamsgallo@gmail.com> 2019-01-22 23:38:45 +0100
commit 7f3126e9325aa8095921c5d6d9c0ee173b7777bd (patch)
tree 3c458f57f6b842b88f052e6d7457d2caa33041d9
parent 611719105f39127a4384ab89173c10425b155b38 (diff)
Add xapian testxapian
-rw-r--r-- papis/database/__init__.py 4
-rw-r--r-- papis/database/xapian.py 333
2 files changed, 337 insertions, 0 deletions
diff --git a/papis/database/__init__.py b/papis/database/__init__.py
index ac284683..b500f6a3 100644
--- a/papis/database/__init__.py
+++ b/papis/database/__init__.py
@@ -24,6 +24,10 @@ def get(library=None):
import papis.database.whoosh
DATABASES[library] = papis.database.whoosh.Database(library)
return DATABASES.get(library)
+ elif backend == "xapian":
+ import papis.database.xapian
+ DATABASES[library] = papis.database.xapian.Database(library)
+ return DATABASES.get(library)
else:
raise Exception('No valid database type: {}'.format(backend))
diff --git a/papis/database/xapian.py b/papis/database/xapian.py
new file mode 100644
index 00000000..2c0626cf
--- /dev/null
+++ b/papis/database/xapian.py
@@ -0,0 +1,333 @@
import logging
import numbers
import os
import sys

import xapian

from source import Sources
from documents import Documents, Document

import papis.utils
import papis.database.base
import papis.database.cache
+
class Database(papis.database.base.Database):
    """Xapian-backed implementation of the papis database interface.

    The index lives in a directory below the papis cache home (see
    :meth:`get_index_dir`).  Terms are stored with the prefixes declared in
    the class-level tables below, following the Omega conventions
    (http://xapian.org/docs/omega/termprefixes.html).
    """

    # Boolean prefixes that are only used internally; they are NOT
    # registered with the query parser.
    BOOLEAN_PREFIX_INTERNAL = {
        # FIXME: use this for doi?
        # 'url': 'U',
        'file': 'P',

        # FIXME: use this for doc mime type
        'type': 'T',
    }

    # Boolean prefixes the user may type in a query string.
    BOOLEAN_PREFIX_EXTERNAL = {
        'id': 'Q',
        'key': 'XBIB|',
        'source': 'XSOURCE|',
        'tag': 'K',
        'year': 'Y',
        'y': 'Y',
    }

    # Free-text (probabilistic) prefixes the user may type in a query string.
    PROBABILISTIC_PREFIX = {
        'title': 'S',
        't': 'S',
        'author': 'A',
        'a': 'A',
    }

    # Value slots used for numeric range queries
    # (http://xapian.org/docs/facets).
    NUMBER_VALUE_FACET = {
        'year': 0,
        'y': 0,
    }

    # FIXME: need to set the following value fields:
    #        publication date
    #        added date
    #        modified date

    def get_backend_name(self):
        """Return the name under which this backend is selected."""
        return 'xapian'

    @staticmethod
    def _as_text(value):
        """Return *value* as ``str``, decoding UTF-8 bytes when necessary.

        The Python 3 Xapian bindings hand term names back as ``bytes``;
        values that already are ``str`` are returned untouched.
        """
        if isinstance(value, str):
            return value
        return value.decode('utf-8')

    def _find_prefix(self, name):
        """Return the term prefix registered for *name*, or ``None``.

        The internal boolean table is consulted first, then the external
        boolean table, then the probabilistic table.
        """
        for table in (self.BOOLEAN_PREFIX_INTERNAL,
                      self.BOOLEAN_PREFIX_EXTERNAL,
                      self.PROBABILISTIC_PREFIX):
            if name in table:
                return table[name]
        return None

    def _find_facet(self, name):
        """Return the value slot registered for *name*, or ``None``."""
        return self.NUMBER_VALUE_FACET.get(name)

    def _make_source_prefix(self, source):
        """Return the term prefix for a source name, e.g. ``XARXIV|``."""
        return 'X%s|' % (source.upper())

    ########################################

    def __init__(self, library=None, writable=False, create=False,
                 force=False):
        """Open (creating it if needed) the index for *library*.

        :param library: Library handed over to the base class.
        :param writable: Accepted for interface compatibility; unused.
        :param create: Accepted for interface compatibility; unused.
        :param force: Accepted for interface compatibility; unused.
        """
        # The original dropped ``library`` on the floor and never
        # initialised the base class.
        # NOTE(review): assumes the base ``__init__`` accepts the library as
        # its only argument -- confirm against papis.database.base.
        papis.database.base.Database.__init__(self, library)
        self.logger = logging.getLogger('db:xapian')
        self.initialize()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Close the underlying index when leaving a ``with`` block.
        self.xapian.close()

    def reopen(self):
        """Reopen the underlying Xapian database."""
        self.xapian.reopen()

    def __contains__(self, docid):
        """Tell whether a Xapian document with id *docid* exists."""
        try:
            self.xapian.get_document(docid)
        except xapian.DocNotFoundError:
            return False
        return True

    def __getitem__(self, docid):
        """Return the document stored under the integer id *docid*.

        :raises TypeError: if *docid* is not an integer.
        """
        # ``long`` does not exist on Python 3; ``numbers.Integral`` covers
        # ``int`` on both major versions (and ``long`` on Python 2).  ``bool``
        # is rejected explicitly, as the original exact-type check did.
        if isinstance(docid, bool) or not isinstance(docid, numbers.Integral):
            raise TypeError("docid must be an int")
        xapian_doc = self.xapian.get_document(docid)
        return Document(self, xapian_doc)

    ########################################

    def _generate_docid(self):
        """Generate a new doc id, based on the last available doc id."""
        return self.xapian.get_lastdocid() + 1

    ########################################

    def _term_iter(self, prefix=None):
        """Yield the terms of the database, optionally limited to *prefix*.

        When *prefix* is given only terms starting with it are produced and
        the prefix is stripped from every yielded term.
        """
        term_iter = iter(self.xapian)
        if not prefix:
            for term in term_iter:
                yield self._as_text(term.term)
            return

        plen = len(prefix)
        try:
            term = term_iter.skip_to(prefix)
        except StopIteration:
            # No term at or after the prefix.  Letting StopIteration escape
            # a generator is a RuntimeError under PEP 479.
            return
        name = self._as_text(term.term)
        if not name.startswith(prefix):
            return
        yield name[plen:]
        for term in term_iter:
            name = self._as_text(term.term)
            if not name.startswith(prefix):
                # Terms are sorted, so the prefixed run is over.
                break
            yield name[plen:]

    def term_iter(self, name=None):
        """Iterator over all terms in the database.

        If a prefix is provided, will iterate over only the prefixed
        terms, and the prefix will be removed from the returned terms.

        :param name: Either a registered field name (``'tag'``, ...) or a
            raw prefix; ``None`` iterates over every term.
        """
        prefix = None
        if name:
            # Unknown names are used verbatim as a raw prefix.
            prefix = self._find_prefix(name) or name
        return self._term_iter(prefix)

    def get_sids(self):
        """Get all sources in database.

        :returns: List of ``'<source>:<oid>'`` strings.
        :rtype: list
        """
        # FIXME: do this more efficiently
        return [
            '%s:%s' % (source, oid)
            for source in self.term_iter('source')
            for oid in self._term_iter(self._make_source_prefix(source))
        ]

    ########################################

    def _search(self, query_string, limit=None):
        """Run *query_string* and return the resulting ``xapian.MSet``.

        :param query_string: Query in query-parser syntax; ``"*"`` matches
            every document.
        :param limit: Maximum number of matches; a falsy value means all
            documents.
        """
        enquire = xapian.Enquire(self.xapian)

        if query_string == "*":
            query = xapian.Query.MatchAll
        else:
            # parse the query string to produce a Xapian::Query object.
            query = self.query_parser.parse_query(query_string)

        if os.getenv('XAPERS_DEBUG_QUERY'):
            # ``print >>sys.stderr`` is Python 2 only; write() works on both.
            sys.stderr.write("query string: {}\n".format(query_string))
            sys.stderr.write("final query: {}\n".format(query))

        # FIXME: need to catch Xapian::Error when using enquire
        enquire.set_query(query)

        # set order of returned docs as newest first
        # FIXME: make this user specifiable
        enquire.set_docid_order(xapian.Enquire.DESCENDING)

        maxitems = limit if limit else self.xapian.get_doccount()
        return enquire.get_mset(0, maxitems)

    def search(self, query_string, limit=0):
        """Search for documents in the database."""
        mset = self._search(query_string, limit)
        return Documents(self, mset)

    def count(self, query_string):
        """Count documents matching search terms."""
        return self._search(query_string).get_matches_estimated()

    def _doc_for_term(self, term):
        """Return the first document indexed under *term*, or ``None``."""
        enquire = xapian.Enquire(self.xapian)
        enquire.set_query(xapian.Query(term))
        # Ask for two matches so that duplicates could be detected.
        mset = enquire.get_mset(0, 2)
        # FIXME: need to throw an exception if more than one match found
        if mset:
            return Document(self, mset[0].document)
        return None

    def doc_for_path(self, path):
        """Return document for specified path."""
        term = self._find_prefix('file') + path
        return self._doc_for_term(term)

    def doc_for_source(self, sid):
        """Return document for source id string (``'<source>:<oid>'``)."""
        source, oid = sid.split(':', 1)
        term = self._make_source_prefix(source) + oid
        return self._doc_for_term(term)

    def doc_for_bib(self, bibkey):
        """Return document for bibtex key."""
        term = self._find_prefix('key') + bibkey
        return self._doc_for_term(term)

    ########################################

    def replace_document(self, docid, doc):
        """Replace (sync) document to database."""
        self.xapian.replace_document(docid, doc)

    def delete_document(self, docid):
        """Delete document from database."""
        self.xapian.delete_document(docid)

    def get_id_key(self):
        """Get the unique key identifier name of the documents in the database

        :returns: key identifier
        :rtype: str
        """
        return 'xapian_id_'

    def get_id_value(self, document):
        """Get the value that is stored in the unique key identifier
        of the documents in the database. In the case of papis this is
        just the path of the documents.

        :param document: Papis document
        :type  document: papis.document.Document
        :returns: Path for the document
        :rtype: str
        """
        return document.get_main_folder()

    def do_indexing(self):
        """Restore a database from an existing root.

        Every document folder found below the library directory is turned
        into a document, added to the index and synced.
        """
        docdirs = papis.utils.get_folders(self.get_dir())
        docdirs.sort()
        documents = papis.database.cache.folders_to_documents(docdirs)
        for doc in documents:
            self.add(doc)
            doc.sync()

    def add(self, document):
        """Add *document* to the index under a freshly generated id.

        :param document: Papis document
        :type  document: papis.document.Document
        """
        # The original read the non-existent ``self.docid`` and then threw
        # the freshly built Xapian document away.
        # NOTE(review): storing under a new id is the presumed intent --
        # confirm.
        docid = self._generate_docid()
        xdoc = xapian.Document()
        xdoc.add_term("{}{}".format(self._find_prefix('id'), docid))
        # FIXME: index the metadata of ``document`` (title, author, ...)
        self.replace_document(docid, xdoc)

    def get_cache_dir(self):
        """Get general directory to store xapian indexes.

        :returns: Full path to xapian cache home directory
        :rtype:  str
        """
        return os.path.join(
            papis.database.cache.get_cache_home(),
            'xapian'
        )

    def initialize(self):
        """Function to be called every time a database object is created.
        It makes sure the index directory exists, opens (or creates) the
        index and configures the term generator and the query parser.
        """
        self.root = self.get_cache_dir()
        # db directory
        index_path = self.get_index_dir()

        # directory initialization
        if not os.path.exists(index_path):
            os.makedirs(index_path)

        self.xapian = xapian.WritableDatabase(
            index_path, xapian.DB_CREATE_OR_OPEN
        )
        stemmer = xapian.Stem("english")

        # The Xapian TermGenerator
        # http://trac.xapian.org/wiki/FAQ/TermGenerator
        self.term_gen = xapian.TermGenerator()
        self.term_gen.set_stemmer(stemmer)

        # The Xapian QueryParser
        self.query_parser = xapian.QueryParser()
        self.query_parser.set_database(self.xapian)
        self.query_parser.set_stemmer(stemmer)
        self.query_parser.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
        self.query_parser.set_default_op(xapian.Query.OP_AND)

        # ``dict.iteritems`` is Python 2 only; ``items`` works everywhere.

        # add boolean external prefixes
        for name, prefix in self.BOOLEAN_PREFIX_EXTERNAL.items():
            self.query_parser.add_boolean_prefix(name, prefix)

        # add probabilistic prefixes
        for name, prefix in self.PROBABILISTIC_PREFIX.items():
            self.query_parser.add_prefix(name, prefix)

        # add value facets
        # NOTE(review): the ValueRangeProcessor API looks deprecated in
        # recent Xapian releases (RangeProcessor replaces it) -- confirm.
        for name, facet in self.NUMBER_VALUE_FACET.items():
            self.query_parser.add_valuerangeprocessor(
                xapian.NumberValueRangeProcessor(facet, name + ':')
            )

        # register known source prefixes
        # FIXME: can we do this by just finding all XSOURCE terms in
        #        db?  Would eliminate dependence on source modules at
        #        search time.
        for source in Sources():
            name = source.name
            self.query_parser.add_boolean_prefix(
                name, self._make_source_prefix(name)
            )

    def get_index_dir(self):
        """Get the directory inside `get_cache_dir` to store the index.

        :returns: Full path to index dir
        :rtype:  str
        """
        path = os.path.expanduser(
            os.path.join(
                self.get_cache_dir(),
                papis.database.cache.get_name(self.get_dir())
            )
        )
        # The original computed the path but forgot to return it, so
        # ``initialize`` received ``None``.
        return path