From e70297b7e29a07457d6c7195425e734a1290abeb Mon Sep 17 00:00:00 2001 From: Scott Date: Mon, 31 Jan 2022 12:00:11 -0800 Subject: [PATCH] Fix a couple of bugs in date parsing. --- dateparse/dateparse_utils.py | 253 ++++++++++++++--------------------- datetime_utils.py | 38 ++++-- 2 files changed, 129 insertions(+), 162 deletions(-) diff --git a/dateparse/dateparse_utils.py b/dateparse/dateparse_utils.py index 21fdb83..be5e1b5 100755 --- a/dateparse/dateparse_utils.py +++ b/dateparse/dateparse_utils.py @@ -21,7 +21,10 @@ import pytz import acl import bootstrap from datetime_utils import ( - TimeUnit, n_timeunits_from_base, datetime_to_date, date_to_datetime + TimeUnit, + n_timeunits_from_base, + datetime_to_date, + date_to_datetime, ) from dateparse.dateparse_utilsLexer import dateparse_utilsLexer # type: ignore from dateparse.dateparse_utilsListener import dateparse_utilsListener # type: ignore @@ -35,50 +38,46 @@ logger = logging.getLogger(__name__) def debug_parse(enter_or_exit_f: Callable[[Any, Any], None]): @functools.wraps(enter_or_exit_f) def debug_parse_wrapper(*args, **kwargs): - slf = args[0] + # slf = args[0] ctx = args[1] depth = ctx.depth() logger.debug( - ' ' * (depth-1) + - f'Entering {enter_or_exit_f.__name__} ({ctx.invokingState} / {ctx.exception})' + ' ' * (depth - 1) + + f'Entering {enter_or_exit_f.__name__} ({ctx.invokingState} / {ctx.exception})' ) for c in ctx.getChildren(): - logger.debug( - ' ' * (depth-1) + - f'{c} {type(c)}' - ) + logger.debug(' ' * (depth - 1) + f'{c} {type(c)}') retval = enter_or_exit_f(*args, **kwargs) return retval + return debug_parse_wrapper class ParseException(Exception): """An exception thrown during parsing because of unrecognized input.""" + def __init__(self, message: str) -> None: self.message = message class RaisingErrorListener(antlr4.DiagnosticErrorListener): """An error listener that raises ParseExceptions.""" - def syntaxError( - self, recognizer, offendingSymbol, line, column, msg, e - ): + + def syntaxError(self, recognizer, offendingSymbol, line, column, msg, e): raise ParseException(msg) def reportAmbiguity( - self, recognizer, dfa, startIndex, stopIndex, exact, - ambigAlts, configs + self, recognizer, dfa, startIndex, stopIndex, exact, ambigAlts, configs ): pass def reportAttemptingFullContext( - self, recognizer, dfa, startIndex, stopIndex, conflictingAlts, - configs + self, recognizer, dfa, startIndex, stopIndex, conflictingAlts, configs ): pass def reportContextSensitivity( - self, recognizer, dfa, startIndex, stopIndex, prediction, configs + self, recognizer, dfa, startIndex, stopIndex, prediction, configs ): pass @@ -90,13 +89,10 @@ class RaisingErrorListener(antlr4.DiagnosticErrorListener): 'enter*', 'exit*', ], - denied_patterns=[ - 'enterEveryRule', - 'exitEveryRule' - ], + denied_patterns=['enterEveryRule', 'exitEveryRule'], order_to_check_allow_deny=acl.Order.DENY_ALLOW, - default_answer=False - ) + default_answer=False, + ), ) class DateParser(dateparse_utilsListener): PARSE_TYPE_SINGLE_DATE_EXPR = 1 @@ -104,11 +100,7 @@ class DateParser(dateparse_utilsListener): PARSE_TYPE_SINGLE_TIME_EXPR = 3 PARSE_TYPE_BASE_AND_OFFSET_TIME_EXPR = 4 - def __init__( - self, - *, - override_now_for_test_purposes = None - ) -> None: + def __init__(self, *, override_now_for_test_purposes=None) -> None: """C'tor. Passing a value to override_now_for_test_purposes can be used to force this instance to use a custom date/time for its idea of "now" so that the code can be more easily unittested. @@ -142,7 +134,7 @@ class DateParser(dateparse_utilsListener): 9: 30, 10: 31, 11: 30, - 12: 31 + 12: 31, } # N.B. day number is also synched with datetime_utils.TimeUnit values @@ -198,7 +190,7 @@ class DateParser(dateparse_utilsListener): This is the main entrypoint to this class for caller code. """ date_string = date_string.strip() - date_string = re.sub('\s+', ' ', date_string) + date_string = re.sub(r'\s+', ' ', date_string) self._reset() listener = RaisingErrorListener() input_stream = antlr4.InputStream(date_string) @@ -246,9 +238,7 @@ class DateParser(dateparse_utilsListener): self.today = datetime.date.today() else: self.now_datetime = self.override_now_for_test_purposes - self.today = datetime_to_date( - self.override_now_for_test_purposes - ) + self.today = datetime_to_date(self.override_now_for_test_purposes) self.date: Optional[datetime.date] = None self.time: Optional[datetime.time] = None self.datetime: Optional[datetime.datetime] = None @@ -275,16 +265,16 @@ class DateParser(dateparse_utilsListener): return TimeUnit.MONTHS txt = orig.lower()[:3] if txt in self.day_name_to_number: - return(TimeUnit(self.day_name_to_number[txt])) + return TimeUnit(self.day_name_to_number[txt]) elif txt in self.delta_unit_to_constant: - return(TimeUnit(self.delta_unit_to_constant[txt])) + return TimeUnit(self.delta_unit_to_constant[txt]) raise ParseException(f'Invalid date unit: {orig}') def _figure_out_time_unit(self, orig: str) -> int: """Figure out what unit a time expression piece is talking about.""" txt = orig.lower()[:3] if txt in self.time_delta_unit_to_constant: - return(self.time_delta_unit_to_constant[txt]) + return self.time_delta_unit_to_constant[txt] raise ParseException(f'Invalid time unit: {orig}') def _parse_special_date(self, name: str) -> Optional[datetime.date]: @@ -317,13 +307,9 @@ class DateParser(dateparse_utilsListener): elif name == 'hallo': return datetime.date(year=year, month=10, day=31) - for holiday_date, holiday_name in sorted( - holidays.US(years=year).items() - ): + for holiday_date, holiday_name in sorted(holidays.US(years=year).items()): if 'Observed' not in holiday_name: - holiday_name = DateParser._normalize_special_day_name( - holiday_name - ) + holiday_name = DateParser._normalize_special_day_name(holiday_name) if name == holiday_name: return holiday_date if name == 'chriseve': @@ -373,10 +359,7 @@ class DateParser(dateparse_utilsListener): self.saw_overt_year = True # Handling "ides" and "nones" requires both the day and month. - if ( - self.context['day'] == 'ide' or - self.context['day'] == 'non' - ): + if self.context['day'] == 'ide' or self.context['day'] == 'non': self.context['day'] = self._resolve_ides_nones( self.context['day'], self.context['month'] ) @@ -452,8 +435,7 @@ class DateParser(dateparse_utilsListener): micros = self.time.microsecond self.datetime = datetime.datetime( - year, month, day, hour, minute, second, micros, - tzinfo=self.time.tzinfo + year, month, day, hour, minute, second, micros, tzinfo=self.time.tzinfo ) # Apply resudual adjustments to times here when we have a @@ -464,7 +446,7 @@ class DateParser(dateparse_utilsListener): self.datetime.minute, self.datetime.second, self.datetime.microsecond, - self.datetime.tzinfo + self.datetime.tzinfo, ) def enterDateExpr(self, ctx: dateparse_utilsParser.DateExprContext): @@ -505,10 +487,10 @@ class DateParser(dateparse_utilsListener): if 'delta_before_after' in self.context: before_after = self.context['delta_before_after'].lower() if ( - before_after == 'before' or - before_after == 'until' or - before_after == 'til' or - before_after == 'to' + before_after == 'before' + or before_after == 'until' + or before_after == 'til' + or before_after == 'to' ): count = -count @@ -516,11 +498,7 @@ class DateParser(dateparse_utilsListener): if 'delta_unit' not in self.context: raise ParseException('Missing delta_unit?!') unit = self.context['delta_unit'] - dt = n_timeunits_from_base( - count, - TimeUnit(unit), - date_to_datetime(self.date) - ) + dt = n_timeunits_from_base(count, TimeUnit(unit), date_to_datetime(self.date)) self.date = datetime_to_date(dt) def exitTimeExpr(self, ctx: dateparse_utilsParser.TimeExprContext) -> None: @@ -550,10 +528,10 @@ class DateParser(dateparse_utilsListener): if 'time_delta_before_after' in self.context: before_after = self.context['time_delta_before_after'].lower() if ( - before_after == 'before' or - before_after == 'until' or - before_after == 'til' or - before_after == 'to' + before_after == 'before' + or before_after == 'until' + or before_after == 'til' + or before_after == 'to' ): count = -count @@ -577,23 +555,17 @@ class DateParser(dateparse_utilsListener): try: n = ctx.nth() if n is None: - raise ParseException( - f'Bad N in Delta +/- Expr: {ctx.getText()}' - ) + raise ParseException(f'Bad N in Delta +/- Expr: {ctx.getText()}') n = n.getText() n = self._get_int(n) - unit = self._figure_out_date_unit( - ctx.deltaUnit().getText().lower() - ) + unit = self._figure_out_date_unit(ctx.deltaUnit().getText().lower()) except Exception: raise ParseException(f'Invalid Delta +/-: {ctx.getText()}') else: self.context['delta_int'] = n self.context['delta_unit'] = unit - def exitNextLastUnit( - self, ctx: dateparse_utilsParser.DeltaUnitContext - ) -> None: + def exitNextLastUnit(self, ctx: dateparse_utilsParser.DeltaUnitContext) -> None: try: unit = self._figure_out_date_unit(ctx.getText().lower()) except Exception: @@ -602,17 +574,13 @@ class DateParser(dateparse_utilsListener): self.context['delta_unit'] = unit def exitDeltaNextLast( - self, ctx: dateparse_utilsParser.DeltaNextLastContext + self, ctx: dateparse_utilsParser.DeltaNextLastContext ) -> None: try: txt = ctx.getText().lower() except Exception: raise ParseException(f'Bad next/last: {ctx.getText()}') - if ( - 'month' in self.context or - 'day' in self.context or - 'year' in self.context - ): + if 'month' in self.context or 'day' in self.context or 'year' in self.context: raise ParseException( 'Next/last expression expected to be relative to today.' ) @@ -635,36 +603,26 @@ class DateParser(dateparse_utilsListener): self, ctx: dateparse_utilsParser.CountUnitsBeforeAfterTimeExprContext ) -> None: if 'nth' not in self.context: - raise ParseException( - f'Bad count expression: {ctx.getText()}' - ) + raise ParseException(f'Bad count expression: {ctx.getText()}') try: - unit = self._figure_out_time_unit( - ctx.deltaTimeUnit().getText().lower() - ) + unit = self._figure_out_time_unit(ctx.deltaTimeUnit().getText().lower()) self.context['time_delta_unit'] = unit except Exception: raise ParseException(f'Bad delta unit: {ctx.getText()}') if 'time_delta_before_after' not in self.context: - raise ParseException( - f'Bad Before/After: {ctx.getText()}' - ) + raise ParseException(f'Bad Before/After: {ctx.getText()}') def exitDeltaTimeFraction( - self, ctx: dateparse_utilsParser.DeltaTimeFractionContext + self, ctx: dateparse_utilsParser.DeltaTimeFractionContext ) -> None: try: txt = ctx.getText().lower()[:4] if txt == 'quar': self.context['time_delta_int'] = 15 - self.context[ - 'time_delta_unit' - ] = TimeUnit.MINUTES + self.context['time_delta_unit'] = TimeUnit.MINUTES elif txt == 'half': self.context['time_delta_int'] = 30 - self.context[ - 'time_delta_unit' - ] = TimeUnit.MINUTES + self.context['time_delta_unit'] = TimeUnit.MINUTES else: raise ParseException(f'Bad time fraction {ctx.getText()}') except Exception: @@ -712,9 +670,7 @@ class DateParser(dateparse_utilsListener): year = self.context.get('year', self.today.year) if 'month' not in self.context: - raise ParseException( - f'Missing month expression: {ctx.getText()}' - ) + raise ParseException(f'Missing month expression: {ctx.getText()}') month = self.context['month'] dow = self.context['dow'] @@ -746,9 +702,7 @@ class DateParser(dateparse_utilsListener): self.context['day'] = 1 self.main_type = DateParser.PARSE_TYPE_BASE_AND_OFFSET_EXPR except Exception: - raise ParseException( - f'Invalid nthWeekday expression: {ctx.getText()}' - ) + raise ParseException(f'Invalid nthWeekday expression: {ctx.getText()}') def exitFirstLastWeekdayInMonthMaybeYearExpr( self, @@ -764,9 +718,7 @@ class DateParser(dateparse_utilsListener): else: self.context['nth'] = i - def exitFirstOrLast( - self, ctx: dateparse_utilsParser.FirstOrLastContext - ) -> None: + def exitFirstOrLast(self, ctx: dateparse_utilsParser.FirstOrLastContext) -> None: try: txt = ctx.getText() if txt == 'first': @@ -774,9 +726,7 @@ class DateParser(dateparse_utilsListener): elif txt == 'last': txt = -1 else: - raise ParseException( - f'Bad first|last expression: {ctx.getText()}' - ) + raise ParseException(f'Bad first|last expression: {ctx.getText()}') except Exception: raise ParseException(f'Bad first|last expression: {ctx.getText()}') else: @@ -791,9 +741,7 @@ class DateParser(dateparse_utilsListener): else: self.context['dow'] = dow - def exitDayOfMonth( - self, ctx: dateparse_utilsParser.DayOfMonthContext - ) -> None: + def exitDayOfMonth(self, ctx: dateparse_utilsParser.DayOfMonthContext) -> None: try: day = ctx.getText().lower() if day[:3] == 'ide': @@ -807,16 +755,12 @@ class DateParser(dateparse_utilsListener): return day = self._get_int(day) if day < 1 or day > 31: - raise ParseException( - f'Bad dayOfMonth expression: {ctx.getText()}' - ) + raise ParseException(f'Bad dayOfMonth expression: {ctx.getText()}') except Exception: raise ParseException(f'Bad dayOfMonth expression: {ctx.getText()}') self.context['day'] = day - def exitMonthName( - self, ctx: dateparse_utilsParser.MonthNameContext - ) -> None: + def exitMonthName(self, ctx: dateparse_utilsParser.MonthNameContext) -> None: try: month = ctx.getText() while month[0] == '/' or month[0] == '-': @@ -824,27 +768,19 @@ class DateParser(dateparse_utilsListener): month = month[:3].lower() month = self.month_name_to_number.get(month, None) if month is None: - raise ParseException( - f'Bad monthName expression: {ctx.getText()}' - ) + raise ParseException(f'Bad monthName expression: {ctx.getText()}') except Exception: raise ParseException(f'Bad monthName expression: {ctx.getText()}') else: self.context['month'] = month - def exitMonthNumber( - self, ctx: dateparse_utilsParser.MonthNumberContext - ) -> None: + def exitMonthNumber(self, ctx: dateparse_utilsParser.MonthNumberContext) -> None: try: month = self._get_int(ctx.getText()) if month < 1 or month > 12: - raise ParseException( - f'Bad monthNumber expression: {ctx.getText()}' - ) + raise ParseException(f'Bad monthNumber expression: {ctx.getText()}') except Exception: - raise ParseException( - f'Bad monthNumber expression: {ctx.getText()}' - ) + raise ParseException(f'Bad monthNumber expression: {ctx.getText()}') else: self.context['month'] = month @@ -879,9 +815,7 @@ class DateParser(dateparse_utilsListener): elif mod.LAST() is not None: self.context['special_next_last'] = 'last' except Exception: - raise ParseException( - f'Bad specialDateNextLast expression: {ctx.getText()}' - ) + raise ParseException(f'Bad specialDateNextLast expression: {ctx.getText()}') def exitNFoosFromTodayAgoExpr( self, ctx: dateparse_utilsParser.NFoosFromTodayAgoExprContext @@ -892,18 +826,13 @@ class DateParser(dateparse_utilsListener): unit = ctx.deltaUnit().getText().lower() ago_from_now = ctx.AGO_FROM_NOW().getText() except Exception: - raise ParseException( - f'Bad NFoosFromTodayAgoExpr: {ctx.getText()}' - ) + raise ParseException(f'Bad NFoosFromTodayAgoExpr: {ctx.getText()}') if "ago" in ago_from_now or "back" in ago_from_now: count = -count unit = self._figure_out_date_unit(unit) - d = n_timeunits_from_base( - count, - TimeUnit(unit), - d) + d = n_timeunits_from_base(count, TimeUnit(unit), d) self.context['year'] = d.year self.context['month'] = d.month self.context['day'] = d.day @@ -911,29 +840,50 @@ class DateParser(dateparse_utilsListener): def exitDeltaRelativeToTodayExpr( self, ctx: dateparse_utilsParser.DeltaRelativeToTodayExprContext ) -> None: + # When someone says "next week" they mean a week from now. + # Likewise next month or last year. These expressions are now + # +/- delta. + # + # But when someone says "this Friday" they mean "this coming + # Friday". It would be weird to say "this Friday" if today + # was already Friday but I'm parsing it to mean: the next day + # that is a Friday. So when you say "next Friday" you mean + # the Friday after this coming Friday, or 2 Fridays from now. + # + # This set handles this weirdness. + weekdays = set( + [ + TimeUnit.MONDAYS, + TimeUnit.TUESDAYS, + TimeUnit.WEDNESDAYS, + TimeUnit.THURSDAYS, + TimeUnit.FRIDAYS, + TimeUnit.SATURDAYS, + TimeUnit.SUNDAYS, + ] + ) d = self.now_datetime try: mod = ctx.thisNextLast() + unit = ctx.deltaUnit().getText().lower() + unit = self._figure_out_date_unit(unit) if mod.LAST(): count = -1 elif mod.THIS(): - count = +1 + if unit in weekdays: + count = +1 + else: + count = 0 elif mod.NEXT(): - count = +2 + if unit in weekdays: + count = +2 + else: + count = +1 else: - raise ParseException( - f'Bad This/Next/Last modifier: {mod}' - ) - unit = ctx.deltaUnit().getText().lower() + raise ParseException(f'Bad This/Next/Last modifier: {mod}') except Exception: - raise ParseException( - f'Bad DeltaRelativeToTodayExpr: {ctx.getText()}' - ) - unit = self._figure_out_date_unit(unit) - d = n_timeunits_from_base( - count, - TimeUnit(unit), - d) + raise ParseException(f'Bad DeltaRelativeToTodayExpr: {ctx.getText()}') + d = n_timeunits_from_base(count, TimeUnit(unit), d) self.context['year'] = d.year self.context['month'] = d.month self.context['day'] = d.day @@ -944,9 +894,7 @@ class DateParser(dateparse_utilsListener): try: txt = ctx.specialTime().getText().lower() except Exception: - raise ParseException( - f'Bad special time expression: {ctx.getText()}' - ) + raise ParseException(f'Bad special time expression: {ctx.getText()}') else: if txt == 'noon' or txt == 'midday': self.context['hour'] = 12 @@ -1076,6 +1024,7 @@ def main() -> None: try: dt = parser.parse(line) except Exception as e: + logger.exception(e) print("Unrecognized.") else: print(dt.strftime('%A %Y/%m/%d %H:%M:%S.%f %Z(%z)')) diff --git a/datetime_utils.py b/datetime_utils.py index 3565936..60b859a 100644 --- a/datetime_utils.py +++ b/datetime_utils.py @@ -304,6 +304,17 @@ def n_timeunits_from_base( >>> n_timeunits_from_base(50, TimeUnit.SECONDS, base) datetime.datetime(2021, 9, 10, 11, 25, 41, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=61200))) + Next month corner case -- it will try to make Feb 31, 2022 then count + backwards. + >>> base = string_to_datetime("2022/01/31 11:24:51AM-0700")[0] + >>> n_timeunits_from_base(1, TimeUnit.MONTHS, base) + datetime.datetime(2022, 2, 28, 11, 24, 51, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=61200))) + + Last month with the same corner case + >>> base = string_to_datetime("2022/03/31 11:24:51AM-0700")[0] + >>> n_timeunits_from_base(-1, TimeUnit.MONTHS, base) + datetime.datetime(2022, 2, 28, 11, 24, 51, tzinfo=datetime.timezone(datetime.timedelta(days=-1, seconds=61200))) + """ assert TimeUnit.is_valid(unit) if count == 0: @@ -364,16 +375,23 @@ def n_timeunits_from_base( new_month %= 12 year_term += 1 new_year = base.year + year_term - return datetime.datetime( - new_year, - new_month, - base.day, - base.hour, - base.minute, - base.second, - base.microsecond, - base.tzinfo, - ) + day = base.day + while True: + try: + ret = datetime.datetime( + new_year, + new_month, + day, + base.hour, + base.minute, + base.second, + base.microsecond, + base.tzinfo, + ) + break + except ValueError: + day -= 1 + return ret # N years from base elif unit == TimeUnit.YEARS: -- 2.45.2