projects
/
python_utils.git
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
More cleanup, yey!
[python_utils.git]
/
logical_search.py
diff --git
a/logical_search.py
b/logical_search.py
index 805ec223010b93b2a1bf68e1fdee9467daac14aa..b55e68901501ad2e68a7f45df034866072c3f61b 100644
(file)
--- a/
logical_search.py
+++ b/
logical_search.py
@@
-1,27
+1,22
@@
#!/usr/bin/env python3
#!/usr/bin/env python3
+"""This module is concerned with creating and searching a corpus of
+documents. The corpus is held in memory so that searches are
+fast."""
+
from __future__ import annotations
from __future__ import annotations
-from collections import defaultdict
import enum
import sys
import enum
import sys
-from typing import (
- Any,
- Dict,
- List,
- NamedTuple,
- Optional,
- Set,
- Sequence,
- Tuple,
- Union,
-)
+from collections import defaultdict
+from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Set, Tuple, Union
class ParseError(Exception):
"""An error encountered while parsing a logical search expression."""
def __init__(self, message: str):
class ParseError(Exception):
"""An error encountered while parsing a logical search expression."""
def __init__(self, message: str):
+ super().__init__()
self.message = message
self.message = message
@@
-30,9
+25,7
@@
class Document(NamedTuple):
docid: str # a unique identifier for the document
tags: Set[str] # an optional set of tags
docid: str # a unique identifier for the document
tags: Set[str] # an optional set of tags
- properties: List[
- Tuple[str, str]
- ] # an optional set of key->value properties
+ properties: List[Tuple[str, str]] # an optional set of key->value properties
reference: Any # an optional reference to something else
reference: Any # an optional reference to something else
@@
-86,15
+79,23
@@
class Corpus(object):
... reference=None,
... )
... )
... reference=None,
... )
... )
+ >>> c.add_doc(Document(
+ ... docid=3,
+ ... tags=set(['urgent']),
+ ... properties=[
+ ... ('author', 'Scott'),
+ ... ('subject', 'car turning in front of you')
+ ... ],
+ ... reference=None,
+ ... )
+ ... )
>>> c.query('author:Scott and important')
{1}
"""
def __init__(self) -> None:
self.docids_by_tag: Dict[str, Set[str]] = defaultdict(set)
>>> c.query('author:Scott and important')
{1}
"""
def __init__(self) -> None:
self.docids_by_tag: Dict[str, Set[str]] = defaultdict(set)
- self.docids_by_property: Dict[Tuple[str, str], Set[str]] = defaultdict(
- set
- )
+ self.docids_by_property: Dict[Tuple[str, str], Set[str]] = defaultdict(set)
self.docids_with_property: Dict[str, Set[str]] = defaultdict(set)
self.documents_by_docid: Dict[str, Document] = {}
self.docids_with_property: Dict[str, Set[str]] = defaultdict(set)
self.documents_by_docid: Dict[str, Document] = {}
@@
-172,13
+173,7
@@
class Corpus(object):
def invert_docid_set(self, original: Set[str]) -> Set[str]:
"""Invert a set of docids."""
def invert_docid_set(self, original: Set[str]) -> Set[str]:
"""Invert a set of docids."""
- return set(
- [
- docid
- for docid in self.documents_by_docid.keys()
- if docid not in original
- ]
- )
+ return set([docid for docid in self.documents_by_docid.keys() if docid not in original])
def get_doc(self, docid: str) -> Optional[Document]:
"""Given a docid, retrieve the previously added Document."""
def get_doc(self, docid: str) -> Optional[Document]:
"""Given a docid, retrieve the previously added Document."""
@@
-258,9
+253,7
@@
class Corpus(object):
operation = Operation.from_token(token)
operand_count = operation.num_operands()
if len(node_stack) < operand_count:
operation = Operation.from_token(token)
operand_count = operation.num_operands()
if len(node_stack) < operand_count:
- raise ParseError(
- f"Incorrect number of operations for {operation}"
- )
+ raise ParseError(f"Incorrect number of operations for {operation}")
for _ in range(operation.num_operands()):
args.append(node_stack.pop())
node = Node(corpus, operation, args)
for _ in range(operation.num_operands()):
args.append(node_stack.pop())
node = Node(corpus, operation, args)
@@
-287,9
+280,7
@@
class Corpus(object):
ok = True
break
if not ok:
ok = True
break
if not ok:
- raise ParseError(
- "Unbalanced parenthesis in query expression"
- )
+ raise ParseError("Unbalanced parenthesis in query expression")
# and, or, not
else:
# and, or, not
else:
@@
-352,9
+343,7
@@
class Node(object):
try:
key, value = tag.split(":")
except ValueError as v:
try:
key, value = tag.split(":")
except ValueError as v:
- raise ParseError(
- f'Invalid key:value syntax at "{tag}"'
- ) from v
+ raise ParseError(f'Invalid key:value syntax at "{tag}"') from v
if value == "*":
r = self.corpus.get_docids_with_property(key)
else:
if value == "*":
r = self.corpus.get_docids_with_property(key)
else:
@@
-366,23
+355,17
@@
class Node(object):
raise ParseError(f"Unexpected query {tag}")
elif self.op is Operation.DISJUNCTION:
if len(evaled_operands) != 2:
raise ParseError(f"Unexpected query {tag}")
elif self.op is Operation.DISJUNCTION:
if len(evaled_operands) != 2:
- raise ParseError(
- "Operation.DISJUNCTION (or) expects two operands."
- )
+ raise ParseError("Operation.DISJUNCTION (or) expects two operands.")
retval.update(evaled_operands[0])
retval.update(evaled_operands[1])
elif self.op is Operation.CONJUNCTION:
if len(evaled_operands) != 2:
retval.update(evaled_operands[0])
retval.update(evaled_operands[1])
elif self.op is Operation.CONJUNCTION:
if len(evaled_operands) != 2:
- raise ParseError(
- "Operation.CONJUNCTION (and) expects two operands."
- )
+ raise ParseError("Operation.CONJUNCTION (and) expects two operands.")
retval.update(evaled_operands[0])
retval = retval.intersection(evaled_operands[1])
elif self.op is Operation.INVERSION:
if len(evaled_operands) != 1:
retval.update(evaled_operands[0])
retval = retval.intersection(evaled_operands[1])
elif self.op is Operation.INVERSION:
if len(evaled_operands) != 1:
- raise ParseError(
- "Operation.INVERSION (not) expects one operand."
- )
+ raise ParseError("Operation.INVERSION (not) expects one operand.")
_ = evaled_operands[0]
if isinstance(_, set):
retval.update(self.corpus.invert_docid_set(_))
_ = evaled_operands[0]
if isinstance(_, set):
retval.update(self.corpus.invert_docid_set(_))
@@
-393,4
+376,5
@@
class Node(object):
if __name__ == '__main__':
import doctest
if __name__ == '__main__':
import doctest
+
doctest.testmod()
doctest.testmod()