import random
import re
import string
-from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple
+from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
import unicodedata
from uuid import uuid4
+import list_utils
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Matches a whole string that is a decimal number: an optional sign, then either
# digits with an optional fraction and an optional exponent, or a bare fraction
# such as ".5".
# The exponent marker is the character class [eE]. The earlier spelling [e|E]
# also admitted a literal "|", so a string like "1|5" was accepted as a number.
# The exponent itself takes digits only; a signed exponent ("1e-5") is not matched.
NUMBER_RE = re.compile(r"^([+\-]?)((\d+)(\.\d+)?([eE]\d+)?|\.\d+)$")
"""
words = txt.split()
- return ngrams_presplit(words, n)
+ for ngram in ngrams_presplit(words, n):
+ ret = ''
+ for word in ngram:
+ ret += f'{word} '
+ yield ret.strip()
def ngrams_presplit(words: Sequence[str], n: int):
    """Return the n-grams of an already-split sequence of words.

    Thin wrapper that delegates to ``list_utils.ngrams``.

    Args:
        words: The tokens to build n-grams from.
        n: Number of consecutive tokens per n-gram.

    Returns:
        Whatever ``list_utils.ngrams(words, n)`` returns.
        NOTE(review): the caller visible in this file iterates the result and
        then iterates the words of each item, so each n-gram is presumably a
        sequence of words rather than a space-joined string -- confirm against
        ``list_utils.ngrams``.
    """
    return list_utils.ngrams(words, n)
def bigrams(txt: str):