Improve wildcard semantics again.
[python_utils.git] / logical_search.py
index 41ed729fc15df80a4477d3094e4dc980869741aa..2f79db09d83dbe0b14e2ab6323e107f31384e148 100644 (file)
@@ -1,11 +1,14 @@
 #!/usr/bin/env python3
 
+# © Copyright 2021-2022, Scott Gasch
+
 """This is a module concerned with the creation of and searching of a
 corpus of documents.  The corpus is held in memory for fast
-searching."""
+searching.
 
-from __future__ import annotations
+"""
 
+from __future__ import annotations
 import enum
 import sys
 from collections import defaultdict
@@ -101,6 +104,12 @@ class Corpus(object):
     ...          )
     >>> c.query('author:Scott and important')
     {1}
+    >>> c.query('*')
+    {1, 2, 3}
+    >>> c.query('*:*')
+    {1, 2, 3}
+    >>> c.query('*:Scott')
+    {1, 3}
     """
 
     def __init__(self) -> None:
@@ -153,7 +162,6 @@ class Corpus(object):
 
     def get_docids_by_exact_tag(self, tag: str) -> Set[str]:
         """Return the set of docids that have a particular tag."""
-
         return self.docids_by_tag[tag]
 
     def get_docids_by_searching_tags(self, tag: str) -> Set[str]:
@@ -183,7 +191,7 @@ class Corpus(object):
     def invert_docid_set(self, original: Set[str]) -> Set[str]:
         """Invert a set of docids."""
 
-        return set([docid for docid in self.documents_by_docid.keys() if docid not in original])
+        return {docid for docid in self.documents_by_docid if docid not in original}
 
     def get_doc(self, docid: str) -> Optional[Document]:
         """Given a docid, retrieve the previously added Document."""
@@ -354,12 +362,24 @@ class Node(object):
                             key, value = tag.split(":")
                         except ValueError as v:
                             raise ParseError(f'Invalid key:value syntax at "{tag}"') from v
-                        if value == "*":
-                            r = self.corpus.get_docids_with_property(key)
+
+                        if key == '*':
+                            r = set()
+                            for kv, s in self.corpus.docids_by_property.items():
+                                if value in ('*', kv[1]):
+                                    r.update(s)
                         else:
-                            r = self.corpus.get_docids_by_property(key, value)
+                            if value == '*':
+                                r = self.corpus.get_docids_with_property(key)
+                            else:
+                                r = self.corpus.get_docids_by_property(key, value)
                     else:
-                        r = self.corpus.get_docids_by_exact_tag(tag)
+                        if tag == '*':
+                            r = set()
+                            for s in self.corpus.docids_by_tag.values():
+                                r.update(s)
+                        else:
+                            r = self.corpus.get_docids_by_exact_tag(tag)
                     retval.update(r)
                 else:
                     raise ParseError(f"Unexpected query {tag}")