X-Git-Url: https://wannabe.guru.org/gitweb/?a=blobdiff_plain;f=logical_search.py;h=0cfbc8d22624bffe9bc227e3ffde0abc0f313053;hb=24b9bcb2a4b74d2c242c8b4172f295b57c09b46d;hp=b55e68901501ad2e68a7f45df034866072c3f61b;hpb=e8fbbb7306430478dec55d2c963eed116d8330cc;p=python_utils.git diff --git a/logical_search.py b/logical_search.py index b55e689..0cfbc8d 100644 --- a/logical_search.py +++ b/logical_search.py @@ -1,15 +1,19 @@ #!/usr/bin/env python3 +# © Copyright 2021-2022, Scott Gasch + """This is a module concerned with the creation of and searching of a corpus of documents. The corpus is held in memory for fast -searching.""" +searching. -from __future__ import annotations +""" +from __future__ import annotations import enum import sys from collections import defaultdict -from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Set, Tuple, Union +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union class ParseError(Exception): @@ -20,13 +24,22 @@ class ParseError(Exception): self.message = message -class Document(NamedTuple): - """A tuple representing a searchable document.""" +@dataclass +class Document: + """A class representing a searchable document.""" + + # A unique identifier for each document. + docid: str = '' - docid: str # a unique idenfier for the document - tags: Set[str] # an optional set of tags - properties: List[Tuple[str, str]] # an optional set of key->value properties - reference: Any # an optional reference to something else + # A set of tag strings for this document. May be empty. + tags: Set[str] = field(default_factory=set) + + # A list of key->value strings for this document. May be empty. + properties: List[Tuple[str, str]] = field(default_factory=list) + + # An optional reference to something else; interpreted only by + # caller code, ignored here. + reference: Optional[Any] = None class Operation(enum.Enum): @@ -91,6 +104,10 @@ class Corpus(object): ... ) >>> c.query('author:Scott and important') {1} + >>> c.query('*') + {1, 2, 3} + >>> c.query('*:*') + {1, 2, 3} """ def __init__(self) -> None: @@ -143,7 +160,6 @@ class Corpus(object): def get_docids_by_exact_tag(self, tag: str) -> Set[str]: """Return the set of docids that have a particular tag.""" - return self.docids_by_tag[tag] def get_docids_by_searching_tags(self, tag: str) -> Set[str]: @@ -173,7 +189,7 @@ class Corpus(object): def invert_docid_set(self, original: Set[str]) -> Set[str]: """Invert a set of docids.""" - return set([docid for docid in self.documents_by_docid.keys() if docid not in original]) + return {docid for docid in self.documents_by_docid if docid not in original} def get_doc(self, docid: str) -> Optional[Document]: """Given a docid, retrieve the previously added Document.""" @@ -344,12 +360,22 @@ class Node(object): key, value = tag.split(":") except ValueError as v: raise ParseError(f'Invalid key:value syntax at "{tag}"') from v - if value == "*": - r = self.corpus.get_docids_with_property(key) + if key == '*': + r = set() + for s in self.corpus.docids_by_tag.values(): + r.update(s) else: - r = self.corpus.get_docids_by_property(key, value) + if value == '*': + r = self.corpus.get_docids_with_property(key) + else: + r = self.corpus.get_docids_by_property(key, value) else: - r = self.corpus.get_docids_by_exact_tag(tag) + if tag == '*': + r = set() + for s in self.corpus.docids_by_tag.values(): + r.update(s) + else: + r = self.corpus.get_docids_by_exact_tag(tag) retval.update(r) else: raise ParseError(f"Unexpected query {tag}")