X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=logical_search.py;h=e710d0b65be57912bcd7781d6c9cc80eeffe2249;hb=a9bdfd8fc9f84b7b2c09a57cd12ba32259e84d1c;hp=805ec223010b93b2a1bf68e1fdee9467daac14aa;hpb=709370b2198e09f1dbe195fe8813602a3125b7f6;p=python_utils.git diff --git a/logical_search.py b/logical_search.py index 805ec22..e710d0b 100644 --- a/logical_search.py +++ b/logical_search.py @@ -1,39 +1,52 @@ #!/usr/bin/env python3 -from __future__ import annotations +# © Copyright 2021-2022, Scott Gasch -from collections import defaultdict +"""This is a module concerned with the creation of and searching of a +corpus of documents. The corpus and index are held in memory. +""" + +from __future__ import annotations import enum import sys -from typing import ( - Any, - Dict, - List, - NamedTuple, - Optional, - Set, - Sequence, - Tuple, - Union, -) +from collections import defaultdict +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union class ParseError(Exception): """An error encountered while parsing a logical search expression.""" def __init__(self, message: str): + super().__init__() self.message = message -class Document(NamedTuple): - """A tuple representing a searchable document.""" +@dataclass +class Document: + """A class representing a searchable document.""" + + docid: str = '' + """A unique identifier for each document -- must be provided + by the caller. See :meth:`python_modules.id_generator.get` or + :meth:`python_modules.string_utils.generate_uuid` for potential + sources.""" - docid: str # a unique idenfier for the document - tags: Set[str] # an optional set of tags - properties: List[ - Tuple[str, str] - ] # an optional set of key->value properties - reference: Any # an optional reference to something else + tags: Set[str] = field(default_factory=set) + """A set of tag strings for this document. May be empty. Tags + are simply text labels that are associated with a document and + may be used to search for it later. + """ + + properties: List[Tuple[str, str]] = field(default_factory=list) + """A list of key->value strings for this document. May be empty. + Properties are more flexible tags that have both a label and a + value. e.g. "category:mystery" or "author:smith".""" + + reference: Optional[Any] = None + """An optional reference to something else for convenience; + interpreted only by caller code, ignored here. + """ class Operation(enum.Enum): @@ -63,7 +76,11 @@ class Operation(enum.Enum): class Corpus(object): - """A collection of searchable documents. + """A collection of searchable documents. The caller can + add documents to it (or edit existing docs) via :meth:`add_doc`, + retrieve a document given its docid via :meth:`get_doc`, and + perform various lookups of documents. The most interesting + lookup is implemented in :meth:`query`. >>> c = Corpus() >>> c.add_doc(Document( @@ -86,15 +103,29 @@ class Corpus(object): ... reference=None, ... ) ... ) + >>> c.add_doc(Document( + ... docid=3, + ... tags=set(['urgent']), + ... properties=[ + ... ('author', 'Scott'), + ... ('subject', 'car turning in front of you') + ... ], + ... reference=None, + ... ) + ... ) >>> c.query('author:Scott and important') {1} + >>> c.query('*') + {1, 2, 3} + >>> c.query('*:*') + {1, 2, 3} + >>> c.query('*:Scott') + {1, 3} """ def __init__(self) -> None: self.docids_by_tag: Dict[str, Set[str]] = defaultdict(set) - self.docids_by_property: Dict[Tuple[str, str], Set[str]] = defaultdict( - set - ) + self.docids_by_property: Dict[Tuple[str, str], Set[str]] = defaultdict(set) self.docids_with_property: Dict[str, Set[str]] = defaultdict(set) self.documents_by_docid: Dict[str, Document] = {} @@ -103,11 +134,14 @@ class Corpus(object): distinct docid that will serve as its primary identifier. If the same Document is added multiple times, only the most recent addition is indexed. If two distinct documents with - the same docid are added, the latter klobbers the former in the - indexes. + the same docid are added, the latter klobbers the former in + the indexes. See :meth:`python_modules.id_generator.get` or + :meth:`python_modules.string_utils.generate_uuid` for potential + sources of docids. Each Document may have an optional set of tags which can be - used later in expressions to the query method. + used later in expressions to the query method. These are simple + text labels. Each Document may have an optional list of key->value tuples which can be used later in expressions to the query method. @@ -116,6 +150,9 @@ class Corpus(object): never interpreted by this module. This is meant to allow easy mapping between Documents in this corpus and external objects they may represent. + + Args: + doc: the document to add or edit """ if doc.docid in self.documents_by_docid: @@ -141,13 +178,27 @@ class Corpus(object): self.docids_with_property[key].add(doc.docid) def get_docids_by_exact_tag(self, tag: str) -> Set[str]: - """Return the set of docids that have a particular tag.""" + """Return the set of docids that have a particular tag. + + Args: + tag: the tag for which to search + Returns: + A set containing docids with the provided tag which + may be empty.""" return self.docids_by_tag[tag] def get_docids_by_searching_tags(self, tag: str) -> Set[str]: - """Return the set of docids with a tag that contains a str""" + """Return the set of docids with a tag that contains a str. + Args: + tag: the tag pattern for which to search + + Returns: + A set containing docids with tags that match the pattern + provided. e.g., if the arg was "foo" tags "football", "foobar", + and "food" all match. + """ ret = set() for search_tag in self.docids_by_tag: if tag in search_tag: @@ -159,48 +210,65 @@ class Corpus(object): """Return the set of docids that have a particular property no matter what that property's value. + Args: + key: the key value to search for. + + Returns: + A set of docids that contain the key (no matter what value) + which may be empty. """ return self.docids_with_property[key] def get_docids_by_property(self, key: str, value: str) -> Set[str]: """Return the set of docids that have a particular property with a - particular value.. + particular value. + + Args: + key: the key to search for + value: the value that key must have in order to match a doc. + Returns: + A set of docids that contain key with value which may be empty. """ return self.docids_by_property[(key, value)] def invert_docid_set(self, original: Set[str]) -> Set[str]: """Invert a set of docids.""" - - return set( - [ - docid - for docid in self.documents_by_docid.keys() - if docid not in original - ] - ) + return {docid for docid in self.documents_by_docid if docid not in original} def get_doc(self, docid: str) -> Optional[Document]: - """Given a docid, retrieve the previously added Document.""" + """Given a docid, retrieve the previously added Document. + + Args: + docid: the docid to retrieve + Returns: + The Document with docid or None to indicate no match. + """ return self.documents_by_docid.get(docid, None) def query(self, query: str) -> Optional[Set[str]]: """Query the corpus for documents that match a logical expression. - Returns a (potentially empty) set of docids for the matching - (previously added) documents or None on error. - e.g. + Args: + query: the logical query expressed using a simple language + that understands conjunction (and operator), disjunction + (or operator) and inversion (not operator) as well as + parenthesis. Here are some legal sample queries:: + + tag1 and tag2 and not tag3 - tag1 and tag2 and not tag3 + (tag1 or tag2) and (tag3 or tag4) - (tag1 or tag2) and (tag3 or tag4) + (tag1 and key2:value2) or (tag2 and key1:value1) - (tag1 and key2:value2) or (tag2 and key1:value1) + key:* - key:* + tag1 and key:* - tag1 and key:* + Returns: + A (potentially empty) set of docids for the matching + (previously added) documents or None on error. """ try: @@ -258,9 +326,7 @@ class Corpus(object): operation = Operation.from_token(token) operand_count = operation.num_operands() if len(node_stack) < operand_count: - raise ParseError( - f"Incorrect number of operations for {operation}" - ) + raise ParseError(f"Incorrect number of operations for {operation}") for _ in range(operation.num_operands()): args.append(node_stack.pop()) node = Node(corpus, operation, args) @@ -287,9 +353,7 @@ class Corpus(object): ok = True break if not ok: - raise ParseError( - "Unbalanced parenthesis in query expression" - ) + raise ParseError("Unbalanced parenthesis in query expression") # and, or, not else: @@ -352,37 +416,41 @@ class Node(object): try: key, value = tag.split(":") except ValueError as v: - raise ParseError( - f'Invalid key:value syntax at "{tag}"' - ) from v - if value == "*": - r = self.corpus.get_docids_with_property(key) + raise ParseError(f'Invalid key:value syntax at "{tag}"') from v + + if key == '*': + r = set() + for kv, s in self.corpus.docids_by_property.items(): + if value in ('*', kv[1]): + r.update(s) else: - r = self.corpus.get_docids_by_property(key, value) + if value == '*': + r = self.corpus.get_docids_with_property(key) + else: + r = self.corpus.get_docids_by_property(key, value) else: - r = self.corpus.get_docids_by_exact_tag(tag) + if tag == '*': + r = set() + for s in self.corpus.docids_by_tag.values(): + r.update(s) + else: + r = self.corpus.get_docids_by_exact_tag(tag) retval.update(r) else: raise ParseError(f"Unexpected query {tag}") elif self.op is Operation.DISJUNCTION: if len(evaled_operands) != 2: - raise ParseError( - "Operation.DISJUNCTION (or) expects two operands." - ) + raise ParseError("Operation.DISJUNCTION (or) expects two operands.") retval.update(evaled_operands[0]) retval.update(evaled_operands[1]) elif self.op is Operation.CONJUNCTION: if len(evaled_operands) != 2: - raise ParseError( - "Operation.CONJUNCTION (and) expects two operands." - ) + raise ParseError("Operation.CONJUNCTION (and) expects two operands.") retval.update(evaled_operands[0]) retval = retval.intersection(evaled_operands[1]) elif self.op is Operation.INVERSION: if len(evaled_operands) != 1: - raise ParseError( - "Operation.INVERSION (not) expects one operand." - ) + raise ParseError("Operation.INVERSION (not) expects one operand.") _ = evaled_operands[0] if isinstance(_, set): retval.update(self.corpus.invert_docid_set(_)) @@ -393,4 +461,5 @@ class Node(object): if __name__ == '__main__': import doctest + doctest.testmod()