Date Extractor
A tool for identifying dates within a string of text.
Example:
date_extractor = DateExtractor(full_text=self.text_with_dates)
date_extractor.normalized_dates[0] # == '2020-01-02'
date_extractor.find_copyright_year() # == '2020-01-02'
-
class sermos_tools.catalog.date_extractor.date_extractor.DateExtractor
(full_text: str, raw_dates_as_dicts: bool = True, document_pages: List[str] = [], date_patterns: sermos_tools.catalog.date_extractor.date_extractor.DatePatterns = DatePatterns(MONTH_OPTIONS=frozenset({'January', 'March', 'Jun', 'Aug', 'Mar', 'Oct', 'Sep', 'October', 'Dec', 'September', 'May', 'Jan', 'November', 'Apr', 'April', 'July', 'December', 'February', 'Feb', 'Jul', 'June', 'August', 'Nov'}), MONTH_ABBR_MAP={'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}, MONTH_NAME_MAP={'january': 1, 'february': 2, 'march': 3, 'april': 4, 'may': 5, 'june': 6, 'july': 7, 'august': 8, 'september': 9, 'october': 10, 'november': 11, 'december': 12}, NUM_MDY_PATTERN=regex.Regex('\\b(?P<month>0?[1-9]|1[0-2])\\s*[/\\.\\-]\\s*(?P<day>0?[1-9]|[12]\\d|3[01])\\s*[/\\.\\-]\\s*(?P<year>\\d{4}|\\d{2})\\b', flags=regex.V0), NUM_MY_PATTERN=regex.Regex('\\b(?P<month>0?[1-9]|1[0-2])\\s*[/\\.\\-]\\s*(?P<year>\\d{4}|\\d{2})\\b', flags=regex.V0), STR_MDY_PATTERN=regex.Regex('\\b(?P<month>\\L<month_options>)\\s*[/\\.\\-\\s]\\s*(?P<day>0?[1-9]|[12]\\d|3[01])\\s*[/\\.\\-,]\\s*(?P<year>\\d{4}|\\d{2})\\b', flags=regex.I | regex.V0, month_options=frozenset({'January', 'March', 'Jun', 'Aug', 'Mar', 'Oct', 'Sep', 'October', 'Dec', 'September', 'May', 'Jan', 'November', 'Apr', 'April', 'July', 'December', 'February', 'Feb', 'Jul', 'June', 'August', 'Nov'})), STR_MY_PATTERN=regex.Regex('\\b(?P<month>\\L<month_options>)\\s*[/\\.\\-\\s,]\\s*(?P<year>\\d{4}|\\d{2})\\b', flags=regex.I | regex.V0, month_options=frozenset({'January', 'March', 'Jun', 'Aug', 'Mar', 'Oct', 'Sep', 'October', 'Dec', 'September', 'May', 'Jan', 'November', 'Apr', 'April', 'July', 'December', 'February', 'Feb', 'Jul', 'June', 'August', 'Nov'})), DATE_PATTERNS=[regex.Regex('\\b(?P<month>0?[1-9]|1[0-2])\\s*[/\\.\\-]\\s*(?P<day>0?[1-9]|[12]\\d|3[01])\\s*[/\\.\\-]\\s*(?P<year>\\d{4}|\\d{2})\\b', flags=regex.V0), 
regex.Regex('\\b(?P<month>0?[1-9]|1[0-2])\\s*[/\\.\\-]\\s*(?P<year>\\d{4}|\\d{2})\\b', flags=regex.V0), regex.Regex('\\b(?P<month>\\L<month_options>)\\s*[/\\.\\-\\s]\\s*(?P<day>0?[1-9]|[12]\\d|3[01])\\s*[/\\.\\-,]\\s*(?P<year>\\d{4}|\\d{2})\\b', flags=regex.I | regex.V0, month_options=frozenset({'January', 'March', 'Jun', 'Aug', 'Mar', 'Oct', 'Sep', 'October', 'Dec', 'September', 'May', 'Jan', 'November', 'Apr', 'April', 'July', 'December', 'February', 'Feb', 'Jul', 'June', 'August', 'Nov'})), regex.Regex('\\b(?P<month>\\L<month_options>)\\s*[/\\.\\-\\s,]\\s*(?P<year>\\d{4}|\\d{2})\\b', flags=regex.I | regex.V0, month_options=frozenset({'January', 'March', 'Jun', 'Aug', 'Mar', 'Oct', 'Sep', 'October', 'Dec', 'September', 'May', 'Jan', 'November', 'Apr', 'April', 'July', 'December', 'February', 'Feb', 'Jul', 'June', 'August', 'Nov'}))]))¶ Extract dates from provided full_text
- Usage:
extractor = DateExtractor('Your Full Text Here Jan 2, 2000')
extractor.normalized_dates  # == ['2000-01-02']
-
class sermos_tools.catalog.date_extractor.date_extractor.DatePatterns
(MONTH_OPTIONS: frozenset = frozenset({'Apr', 'April', 'Aug', 'August', 'Dec', 'December', 'Feb', 'February', 'Jan', 'January', 'Jul', 'July', 'Jun', 'June', 'Mar', 'March', 'May', 'Nov', 'November', 'Oct', 'October', 'Sep', 'September'}), MONTH_ABBR_MAP: dict = {'apr': 4, 'aug': 8, 'dec': 12, 'feb': 2, 'jan': 1, 'jul': 7, 'jun': 6, 'mar': 3, 'may': 5, 'nov': 11, 'oct': 10, 'sep': 9}, MONTH_NAME_MAP: dict = {'april': 4, 'august': 8, 'december': 12, 'february': 2, 'january': 1, 'july': 7, 'june': 6, 'march': 3, 'may': 5, 'november': 11, 'october': 10, 'september': 9}, NUM_MDY_PATTERN: <module 'regex' from '/usr/local/lib/python3.7/site-packages/regex/__init__.py'> = regex.Regex('\\b(?P<month>0?[1-9]|1[0-2])\\s*[/\\.\\-]\\s*(?P<day>0?[1-9]|[12]\\d|3[01])\\s*[/\\.\\-]\\s*(?P<year>\\d{4}|\\d{2})\\b', flags=regex.V0), NUM_MY_PATTERN: <module 'regex' from '/usr/local/lib/python3.7/site-packages/regex/__init__.py'> = regex.Regex('\\b(?P<month>0?[1-9]|1[0-2])\\s*[/\\.\\-]\\s*(?P<year>\\d{4}|\\d{2})\\b', flags=regex.V0), STR_MDY_PATTERN: <module 'regex' from '/usr/local/lib/python3.7/site-packages/regex/__init__.py'> = regex.Regex('\\b(?P<month>\\L<month_options>)\\s*[/\\.\\-\\s]\\s*(?P<day>0?[1-9]|[12]\\d|3[01])\\s*[/\\.\\-,]\\s*(?P<year>\\d{4}|\\d{2})\\b', flags=regex.I | regex.V0, month_options=frozenset({'January', 'March', 'Jun', 'Aug', 'Mar', 'Oct', 'Sep', 'October', 'Dec', 'September', 'May', 'Jan', 'November', 'Apr', 'April', 'July', 'December', 'February', 'Feb', 'Jul', 'June', 'August', 'Nov'})), STR_MY_PATTERN: <module 'regex' from '/usr/local/lib/python3.7/site-packages/regex/__init__.py'> = regex.Regex('\\b(?P<month>\\L<month_options>)\\s*[/\\.\\-\\s,]\\s*(?P<year>\\d{4}|\\d{2})\\b', flags=regex.I | regex.V0, month_options=frozenset({'January', 'March', 'Jun', 'Aug', 'Mar', 'Oct', 'Sep', 'October', 'Dec', 'September', 'May', 'Jan', 'November', 'Apr', 'April', 'July', 'December', 'February', 'Feb', 'Jul', 'June', 'August', 'Nov'})), DATE_PATTERNS: list = 
[regex.Regex('\\b(?P<month>0?[1-9]|1[0-2])\\s*[/\\.\\-]\\s*(?P<day>0?[1-9]|[12]\\d|3[01])\\s*[/\\.\\-]\\s*(?P<year>\\d{4}|\\d{2})\\b', flags=regex.V0), regex.Regex('\\b(?P<month>0?[1-9]|1[0-2])\\s*[/\\.\\-]\\s*(?P<year>\\d{4}|\\d{2})\\b', flags=regex.V0), regex.Regex('\\b(?P<month>\\L<month_options>)\\s*[/\\.\\-\\s]\\s*(?P<day>0?[1-9]|[12]\\d|3[01])\\s*[/\\.\\-,]\\s*(?P<year>\\d{4}|\\d{2})\\b', flags=regex.I | regex.V0, month_options=frozenset({'January', 'March', 'Jun', 'Aug', 'Mar', 'Oct', 'Sep', 'October', 'Dec', 'September', 'May', 'Jan', 'November', 'Apr', 'April', 'July', 'December', 'February', 'Feb', 'Jul', 'June', 'August', 'Nov'})), regex.Regex('\\b(?P<month>\\L<month_options>)\\s*[/\\.\\-\\s,]\\s*(?P<year>\\d{4}|\\d{2})\\b', flags=regex.I | regex.V0, month_options=frozenset({'January', 'March', 'Jun', 'Aug', 'Mar', 'Oct', 'Sep', 'October', 'Dec', 'September', 'May', 'Jan', 'November', 'Apr', 'April', 'July', 'December', 'February', 'Feb', 'Jul', 'June', 'August', 'Nov'}))])¶ Common date patterns for regex-based searches against fulltext.