diff --git a/datefinder/constants.py b/datefinder/constants.py index 8a1cfd4..dce1401 100644 --- a/datefinder/constants.py +++ b/datefinder/constants.py @@ -1,15 +1,16 @@ import regex as re -NUMBERS_PATTERN = r"first|second|third|fourth|fifth|sixth|seventh|eighth|nineth|tenth" +NUMBERS_PATTERN = r"seventh|second|fourth|eighth|nineth|first|third|fifth|sixth|tenth" POSITIONNAL_TOKENS = r"next|last" DIGITS_PATTERN = r"\d+" DIGITS_SUFFIXES = r"st|th|rd|nd" -DAYS_PATTERN = "monday|tuesday|wednesday|thursday|friday|saturday|sunday|mandag|tirsdag|onsdag|torsdag|fredag|lørdag|søndag|mon|tue|tues|wed|thu|thur|thurs|fri|sat|sun|man|tir|tirs|ons|tor|tors|fre|lør|søn" -MONTHS_PATTERN = r"january|february|march|april|may|june|july|august|september|october|november|december|enero|febrero|marzo|abril|mayo|junio|julio|agosto|septiembre|octubre|noviembre|diciembre|januar|februar|marts|april|maj|juni|juli|august|september|oktober|november|december|jan[\.\s]|ene[\.\s]|feb[\.\s]|mar[\.\s]|apr[\.\s]|abr[\.\s]|may[\.\s]|maj[\.\s]|jun[\.\s]|jul[\.\s]|aug[\.\s]|ago[\.\s]|sep[^A-Za-z]|sept[\.\s]|oct[\.\s]|okt[\.\s]|nov[\.\s]|dec[\.\s]|dic[\.\s]" -TIMEZONES_PATTERN = "ACDT|ACST|ACT|ACWDT|ACWST|ADDT|ADMT|ADT|AEDT|AEST|AFT|AHDT|AHST|AKDT|AKST|AKTST|AKTT|ALMST|ALMT|AMST|AMT|ANAST|ANAT|ANT|APT|AQTST|AQTT|ARST|ART|ASHST|ASHT|AST|AWDT|AWST|AWT|AZOMT|AZOST|AZOT|AZST|AZT|BAKST|BAKT|BDST|BDT|BEAT|BEAUT|BIOT|BMT|BNT|BORT|BOST|BOT|BRST|BRT|BST|BTT|BURT|CANT|CAPT|CAST|CAT|CAWT|CCT|CDDT|CDT|CEDT|CEMT|CEST|CET|CGST|CGT|CHADT|CHAST|CHDT|CHOST|CHOT|CIST|CKHST|CKT|CLST|CLT|CMT|COST|COT|CPT|CST|CUT|CVST|CVT|CWT|CXT|ChST|DACT|DAVT|DDUT|DFT|DMT|DUSST|DUST|EASST|EAST|EAT|ECT|EDDT|EDT|EEDT|EEST|EET|EGST|EGT|EHDT|EMT|EPT|EST|ET|EWT|FET|FFMT|FJST|FJT|FKST|FKT|FMT|FNST|FNT|FORT|FRUST|FRUT|GALT|GAMT|GBGT|GEST|GET|GFT|GHST|GILT|GIT|GMT|GST|GYT|HAA|HAC|HADT|HAE|HAP|HAR|HAST|HAT|HAY|HDT|HKST|HKT|HLV|HMT|HNA|HNC|HNE|HNP|HNR|HNT|HNY|HOVST|HOVT|HST|ICT|IDDT|IDT|IHST|IMT|IOT|IRDT|IRKST|IRKT|IRST|ISST|IST|JAVT|JCST|JDT|JMT|JST|JWST|KART|KDT|KGST|KGT|KIZST|KIZT|KMT|KOST|KRAST|KRAT|KST|KUYST|KUYT|KWAT|LHDT|LHST|LINT|LKT|LMT|LMT|LMT|LMT|LRT|LST|MADMT|MADST|MADT|MAGST|MAGT|MALST|MALT|MART|MAWT|MDDT|MDST|MDT|MEST|MET|MHT|MIST|MIT|MMT|MOST|MOT|MPT|MSD|MSK|MSM|MST|MUST|MUT|MVT|MWT|MYT|NCST|NCT|NDDT|NDT|NEGT|NEST|NET|NFT|NMT|NOVST|NOVT|NPT|NRT|NST|NT|NUT|NWT|NZDT|NZMT|NZST|OMSST|OMST|ORAST|ORAT|PDDT|PDT|PEST|PET|PETST|PETT|PGT|PHOT|PHST|PHT|PKST|PKT|PLMT|PMDT|PMMT|PMST|PMT|PNT|PONT|PPMT|PPT|PST|PT|PWT|PYST|PYT|QMT|QYZST|QYZT|RET|RMT|ROTT|SAKST|SAKT|SAMT|SAST|SBT|SCT|SDMT|SDT|SET|SGT|SHEST|SHET|SJMT|SLT|SMT|SRET|SRT|SST|STAT|SVEST|SVET|SWAT|SYOT|TAHT|TASST|TAST|TBIST|TBIT|TBMT|TFT|THA|TJT|TKT|TLT|TMT|TOST|TOT|TRST|TRT|TSAT|TVT|ULAST|ULAT|URAST|URAT|UTC|UYHST|UYST|UYT|UZST|UZT|VET|VLAST|VLAT|VOLST|VOLT|VOST|VUST|VUT|WARST|WART|WAST|WAT|WDT|WEDT|WEMT|WEST|WET|WFT|WGST|WGT|WIB|WIT|WITA|WMT|WSDT|WSST|WST|WT|XJT|YAKST|YAKT|YAPT|YDDT|YDT|YEKST|YEKST|YEKT|YEKT|YERST|YERT|YPT|YST|YWT|zzz" +DAYS_PATTERN = "wednesday|thursday|saturday|tuesday|tirsdag|torsdag|monday|friday|sunday|mandag|onsdag|fredag|lørdag|søndag|thurs|tues|thur|tirs|tors|mon|tue|wed|thu|fri|sat|sun|man|tir|ons|tor|fre|lør|søn" +MONTHS_PATTERN = r"septiembre|september|noviembre|diciembre|september|february|november|december|november|december|january|october|febrero|octubre|februar|oktober|august|agosto|januar|august|march|april|enero|marzo|abril|junio|julio|marts|april|sept[\.\s]|june|july|mayo|juni|juli|jan[\.\s]|ene[\.\s]|feb[\.\s]|mar[\.\s]|apr[\.\s]|abr[\.\s]|may[\.\s]|maj[\.\s]|jun[\.\s]|jul[\.\s]|aug[\.\s]|ago[\.\s]|sep[^A-Za-z]|oct[\.\s]|okt[\.\s]|nov[\.\s]|dec[\.\s]|dic[\.\s]|may|maj" +TIMEZONES_PATTERN = "ACWDT|ACWST|AKTST|ALMST|ANAST|AQTST|ASHST|AZOMT|AZOST|BAKST|BEAUT|CHADT|CHAST|CHOST|CKHST|DUSST|EASST|FRUST|HOVST|IRKST|KIZST|KRAST|KUYST|MADMT|MADST|MAGST|MALST|NOVST|OMSST|ORAST|PETST|QYZST|SAKST|SHEST|SVEST|TASST|TBIST|ULAST|URAST|UYHST|VLAST|VOLST|WARST|YAKST|YEKST|YEKST|YERST|ACDT|ACST|ADDT|ADMT|AEDT|AEST|AHDT|AHST|AKDT|AKST|AKTT|ALMT|AMST|ANAT|AQTT|ARST|ASHT|AWDT|AWST|AZOT|AZST|BAKT|BDST|BEAT|BIOT|BORT|BOST|BRST|BURT|CANT|CAPT|CAST|CAWT|CDDT|CEDT|CEMT|CEST|CGST|CHDT|CHOT|CIST|CLST|COST|CVST|ChST|DACT|DAVT|DDUT|DUST|EAST|EDDT|EEDT|EEST|EGST|EHDT|FFMT|FJST|FKST|FNST|FORT|FRUT|GALT|GAMT|GBGT|GEST|GHST|GILT|HADT|HAST|HKST|HOVT|IDDT|IHST|IRDT|IRKT|IRST|ISST|JAVT|JCST|JWST|KART|KGST|KIZT|KOST|KRAT|KUYT|KWAT|LHDT|LHST|LINT|MADT|MAGT|MALT|MART|MAWT|MDDT|MDST|MEST|MIST|MOST|MUST|NCST|NDDT|NEGT|NEST|NOVT|NZDT|NZMT|NZST|OMST|ORAT|PDDT|PEST|PETT|PHOT|PHST|PKST|PLMT|PMDT|PMMT|PMST|PONT|PPMT|PYST|QYZT|ROTT|SAKT|SAMT|SAST|SDMT|SHET|SJMT|SRET|STAT|SVET|SWAT|SYOT|TAHT|TAST|TBIT|TBMT|TOST|TRST|TSAT|ULAT|URAT|UYST|UZST|VLAT|VOLT|VOST|VUST|WART|WAST|WEDT|WEMT|WEST|WGST|WITA|WSDT|WSST|YAKT|YAPT|YDDT|YEKT|YEKT|YERT|ACT|ADT|AFT|AMT|ANT|APT|ART|AST|AWT|AZT|BDT|BMT|BNT|BOT|BRT|BST|BTT|CAT|CCT|CDT|CET|CGT|CKT|CLT|CMT|COT|CPT|CST|CUT|CVT|CWT|CXT|DFT|DMT|EAT|ECT|EDT|EET|EGT|EMT|EPT|EST|EWT|FET|FJT|FKT|FMT|FNT|GET|GFT|GIT|GMT|GST|GYT|HAA|HAC|HAE|HAP|HAR|HAT|HAY|HDT|HKT|HLV|HMT|HNA|HNC|HNE|HNP|HNR|HNT|HNY|HST|ICT|IDT|IMT|IOT|IST|JDT|JMT|JST|KDT|KGT|KMT|KST|LKT|LMT|LMT|LMT|LMT|LRT|LST|MDT|MET|MHT|MIT|MMT|MOT|MPT|MSD|MSK|MSM|MST|MUT|MVT|MWT|MYT|NCT|NDT|NET|NFT|NMT|NPT|NRT|NST|NUT|NWT|PDT|PET|PGT|PHT|PKT|PMT|PNT|PPT|PST|PWT|PYT|QMT|RET|RMT|SBT|SCT|SDT|SET|SGT|SLT|SMT|SRT|SST|TFT|THA|TJT|TKT|TLT|TMT|TOT|TRT|TVT|UTC|UYT|UZT|VET|VUT|WAT|WDT|WET|WFT|WGT|WIB|WIT|WMT|WST|XJT|YDT|YPT|YST|YWT|zzz|ET|NT|PT|WT" ## explicit north american timezones that get replaced NA_TIMEZONES_PATTERN = "pacific|eastern|mountain|central" -ALL_TIMEZONES_PATTERN = TIMEZONES_PATTERN + "|" + NA_TIMEZONES_PATTERN +# na timezones are longer +ALL_TIMEZONES_PATTERN = NA_TIMEZONES_PATTERN + "|" + TIMEZONES_PATTERN # Allows for straightforward datestamps e.g 2017, 201712, 20171223. Created with: # YYYYMM_PATTERN = '|'.join(['19\d\d'+'{:0>2}'.format(mon)+'|20\d\d'+'{:0>2}'.format(mon) for mon in range(1, 13)]) @@ -34,7 +35,7 @@ DELIMITERS_PATTERN = r"[/\:\-\,\.\s\_\+\@]+" TIME_PERIOD_PATTERN = r"a\.m\.|am|p\.m\.|pm" ## can be in date strings but not recognized by dateutils -EXTRA_TOKENS_PATTERN = r"due|by|on|during|standard|daylight|savings|time|date|dated|of|to|through|between|until|at|day" +EXTRA_TOKENS_PATTERN = r"standard|daylight|savings|through|between|during|dated|until|time|date|due|day|by|on|of|to|at" ## TODO: Get english numbers? ## http://www.rexegg.com/regex-trick-numbers-in-english.html