#!/usr/bin/env python3
-from __future__ import annotations
+# © Copyright 2021-2022, Scott Gasch
+
+"""This is a module concerned with the creation of and searching of a
+corpus of documents. The corpus is held in memory for fast
+searching.
+"""
+
+from __future__ import annotations
import enum
import sys
from collections import defaultdict
-from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Set, Tuple, Union
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union
class ParseError(Exception):
"""An error encountered while parsing a logical search expression."""
def __init__(self, message: str):
+ # Forward the message to Exception so that str(exc), exc.args and
+ # tracebacks all carry it; it also stays available as .message.
+ super().__init__(message)
self.message = message
-class Document(NamedTuple):
- """A tuple representing a searchable document."""
+@dataclass
+class Document:
+ """A class representing a searchable document.
+
+ Every field has a default, so a bare Document() is valid. Instances
+ are mutable and, unlike a NamedTuple, cannot be unpacked or indexed.
+ """
+
+ # A unique identifier for each document.
+ docid: str = ''
+
+ # A set of tag strings for this document. May be empty.
+ tags: Set[str] = field(default_factory=set)
+
+ # A list of (key, value) string pairs for this document. May be empty.
+ properties: List[Tuple[str, str]] = field(default_factory=list)
- docid: str # a unique idenfier for the document
- tags: Set[str] # an optional set of tags
- properties: List[Tuple[str, str]] # an optional set of key->value properties
- reference: Any # an optional reference to something else
+ # An optional reference to something else; interpreted only by
+ # caller code, ignored here.
+ reference: Optional[Any] = None
class Operation(enum.Enum):
def invert_docid_set(self, original: Set[str]) -> Set[str]:
"""Invert a set of docids."""
- return set(
- [docid for docid in self.documents_by_docid.keys() if docid not in original]
- )
+ # The complement is taken relative to the docids in self.documents_by_docid.
+ return {docid for docid in self.documents_by_docid if docid not in original}
def get_doc(self, docid: str) -> Optional[Document]:
"""Given a docid, retrieve the previously added Document."""
operation = Operation.from_token(token)
operand_count = operation.num_operands()
if len(node_stack) < operand_count:
- raise ParseError(
- f"Incorrect number of operations for {operation}"
- )
+ raise ParseError(f"Incorrect number of operations for {operation}")
for _ in range(operation.num_operands()):
args.append(node_stack.pop())
node = Node(corpus, operation, args)
try:
key, value = tag.split(":")
except ValueError as v:
- raise ParseError(
- f'Invalid key:value syntax at "{tag}"'
- ) from v
+ raise ParseError(f'Invalid key:value syntax at "{tag}"') from v
if value == "*":
r = self.corpus.get_docids_with_property(key)
else: