-
Notifications
You must be signed in to change notification settings - Fork 137
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Python 3 version of timex library #26
base: master
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,19 +1,15 @@ | ||
# Code for tagging temporal expressions in text | ||
# For details of the TIMEX format, see http://timex2.mitre.org/ | ||
# Converted to Python3 by Brian Hockenmaier in 2019 | ||
|
||
import re | ||
import string | ||
import os | ||
import sys | ||
from datetime import datetime, timedelta | ||
|
||
# Requires eGenix.com mx Base Distribution | ||
# Python3 version no longer requires eGenix.com mx Base Distribution | ||
# http://www.egenix.com/products/python/mxBase/ | ||
try: | ||
from mx.DateTime import * | ||
except ImportError: | ||
print """ | ||
Requires eGenix.com mx Base Distribution | ||
http://www.egenix.com/products/python/mxBase/""" | ||
|
||
# Predefined strings. | ||
numbers = "(^a(?=\s)|one|two|three|four|five|six|seven|eight|nine|ten| \ | ||
|
@@ -81,29 +77,29 @@ def tag(text): | |
# Hash function for week days to simplify the grounding task. | ||
# [Mon..Sun] -> [0..6] | ||
hashweekdays = { | ||
'Monday': 0, | ||
'Tuesday': 1, | ||
'Wednesday': 2, | ||
'Thursday': 3, | ||
'Friday': 4, | ||
'Saturday': 5, | ||
'Sunday': 6} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These capitalized day and month names were not working in every scenario for me. By converting to lowercase for input text these issues were resolved. |
||
'monday': 0, | ||
'tuesday': 1, | ||
'wednesday': 2, | ||
'thursday': 3, | ||
'friday': 4, | ||
'saturday': 5, | ||
'sunday': 6} | ||
|
||
# Hash function for months to simplify the grounding task. | ||
# [Jan..Dec] -> [1..12] | ||
hashmonths = { | ||
'January': 1, | ||
'February': 2, | ||
'March': 3, | ||
'April': 4, | ||
'May': 5, | ||
'June': 6, | ||
'July': 7, | ||
'August': 8, | ||
'September': 9, | ||
'October': 10, | ||
'November': 11, | ||
'December': 12} | ||
'january': 1, | ||
'february': 2, | ||
'march': 3, | ||
'april': 4, | ||
'may': 5, | ||
'june': 6, | ||
'july': 7, | ||
'august': 8, | ||
'september': 9, | ||
'october': 10, | ||
'november': 11, | ||
'december': 12} | ||
|
||
# Hash number in words into the corresponding integer value | ||
def hashnum(number): | ||
|
@@ -175,9 +171,14 @@ def ground(tagged_text, base_date): | |
timex_found = timex_regex.findall(tagged_text) | ||
timex_found = map(lambda timex:re.sub(r'</?TIMEX2.*?>', '', timex), \ | ||
timex_found) | ||
timexList = [] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This new variable is used to return timex values as a list in addition to the timex tagged format. |
||
|
||
# Calculate the new date accordingly | ||
for timex in timex_found: | ||
# global month | ||
month = "(january|february|march|april|may|june|july|august|september| \ | ||
october|november|december)" | ||
|
||
Comment on lines
+179
to
+181
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a bug in the original version here where the global variable month was overwritten and then unusable on subsequent calls to timex. Adding month in here solves the issue. |
||
timex_val = 'UNKNOWN' # Default value | ||
|
||
timex_ori = timex # Backup original timex for later substitution | ||
|
@@ -191,7 +192,7 @@ def ground(tagged_text, base_date): | |
unit = split_timex[1] | ||
num_list = map(lambda s:hashnum(s),re.findall(numbers + '+', \ | ||
value, re.IGNORECASE)) | ||
timex = `sum(num_list)` + ' ' + unit | ||
timex = sum(num_list) + ' ' + unit | ||
|
||
# If timex matches ISO format, remove 'time' and reorder 'date' | ||
if re.match(r'\d+[/-]\d+[/-]\d+ \d+:\d+:\d+\.\d+', timex): | ||
|
@@ -207,43 +208,46 @@ def ground(tagged_text, base_date): | |
elif re.match(r'tonight|tonite|today', timex, re.IGNORECASE): | ||
timex_val = str(base_date) | ||
elif re.match(r'yesterday', timex, re.IGNORECASE): | ||
timex_val = str(base_date + RelativeDateTime(days=-1)) | ||
timex_val = str(base_date + timedelta(days=-1)) | ||
elif re.match(r'tomorrow', timex, re.IGNORECASE): | ||
timex_val = str(base_date + RelativeDateTime(days=+1)) | ||
timex_val = str(base_date + timedelta(days=+1)) | ||
|
||
# Weekday in the previous week. | ||
elif re.match(r'last ' + week_day, timex, re.IGNORECASE): | ||
day = hashweekdays[timex.split()[1]] | ||
timex_val = str(base_date + RelativeDateTime(weeks=-1, \ | ||
weekday=(day,0))) | ||
target_day = hashweekdays[timex.split()[1]] | ||
monday_of_base_week = base_date - timedelta(days=base_date.weekday()) | ||
monday_of_target_week = base_date + timedelta(weeks=-1) | ||
timex_val = str(monday_of_target_week + timedelta(days=target_day+1)) | ||
|
||
# Weekday in the current week. | ||
elif re.match(r'this ' + week_day, timex, re.IGNORECASE): | ||
day = hashweekdays[timex.split()[1]] | ||
timex_val = str(base_date + RelativeDateTime(weeks=0, \ | ||
weekday=(day,0))) | ||
target_day = hashweekdays[timex.split()[1]] | ||
monday_of_base_week = base_date - timedelta(days=base_date.weekday()) | ||
monday_of_target_week = base_date + timedelta(weeks=0) | ||
timex_val = str(monday_of_target_week + timedelta(days=target_day+1)) | ||
|
||
# Weekday in the following week. | ||
elif re.match(r'next ' + week_day, timex, re.IGNORECASE): | ||
day = hashweekdays[timex.split()[1]] | ||
timex_val = str(base_date + RelativeDateTime(weeks=+1, \ | ||
weekday=(day,0))) | ||
target_day = hashweekdays[timex.split()[1]] | ||
monday_of_base_week = base_date - timedelta(days=base_date.weekday()) | ||
monday_of_target_week = base_date + timedelta(weeks=+1) | ||
timex_val = str(monday_of_target_week + timedelta(days=target_day+1)) | ||
|
||
# Last, this, next week. | ||
elif re.match(r'last week', timex, re.IGNORECASE): | ||
year = (base_date + RelativeDateTime(weeks=-1)).year | ||
year = (base_date + timedelta(weeks=-1)).year | ||
|
||
# iso_week returns a triple (year, week, day) hence, retrieve | ||
# only week value. | ||
week = (base_date + RelativeDateTime(weeks=-1)).iso_week[1] | ||
week = (base_date + timedelta(weeks=-1)).isocalendar()[1] | ||
timex_val = str(year) + 'W' + str(week) | ||
elif re.match(r'this week', timex, re.IGNORECASE): | ||
year = (base_date + RelativeDateTime(weeks=0)).year | ||
week = (base_date + RelativeDateTime(weeks=0)).iso_week[1] | ||
year = (base_date + timedelta(weeks=0)).year | ||
week = (base_date + timedelta(weeks=0)).isocalendar()[1] | ||
timex_val = str(year) + 'W' + str(week) | ||
elif re.match(r'next week', timex, re.IGNORECASE): | ||
year = (base_date + RelativeDateTime(weeks=+1)).year | ||
week = (base_date + RelativeDateTime(weeks=+1)).iso_week[1] | ||
year = (base_date + timedelta(weeks=+1)).year | ||
week = (base_date + timedelta(weeks=+1)).isocalendar()[1] | ||
timex_val = str(year) + 'W' + str(week) | ||
|
||
# Month in the previous year. | ||
|
@@ -286,20 +290,20 @@ def ground(tagged_text, base_date): | |
|
||
# Calculate the offset by taking '\d+' part from the timex. | ||
offset = int(re.split(r'\s', timex)[0]) | ||
timex_val = str(base_date + RelativeDateTime(days=-offset)) | ||
timex_val = str(base_date + timedelta(days=-offset)) | ||
elif re.match(r'\d+ days? (later|after)', timex, re.IGNORECASE): | ||
offset = int(re.split(r'\s', timex)[0]) | ||
timex_val = str(base_date + RelativeDateTime(days=+offset)) | ||
timex_val = str(base_date + timedelta(days=+offset)) | ||
elif re.match(r'\d+ weeks? (ago|earlier|before)', timex, re.IGNORECASE): | ||
offset = int(re.split(r'\s', timex)[0]) | ||
year = (base_date + RelativeDateTime(weeks=-offset)).year | ||
year = (base_date + timedelta(weeks=-offset)).year | ||
week = (base_date + \ | ||
RelativeDateTime(weeks=-offset)).iso_week[1] | ||
timedelta(weeks=-offset)).isocalendar()[1] | ||
timex_val = str(year) + 'W' + str(week) | ||
elif re.match(r'\d+ weeks? (later|after)', timex, re.IGNORECASE): | ||
offset = int(re.split(r'\s', timex)[0]) | ||
year = (base_date + RelativeDateTime(weeks=+offset)).year | ||
week = (base_date + RelativeDateTime(weeks=+offset)).iso_week[1] | ||
year = (base_date + timedelta(weeks=+offset)).year | ||
week = (base_date + timedelta(weeks=+offset)).isocalendar()[1] | ||
timex_val = str(year) + 'W' + str(week) | ||
elif re.match(r'\d+ months? (ago|earlier|before)', timex, re.IGNORECASE): | ||
extra = 0 | ||
|
@@ -343,15 +347,20 @@ def ground(tagged_text, base_date): | |
# Substitute tag+timex in the text with grounded tag+timex. | ||
tagged_text = re.sub('<TIMEX2>' + timex_ori + '</TIMEX2>', '<TIMEX2 val=\"' \ | ||
+ timex_val + '\">' + timex_ori + '</TIMEX2>', tagged_text) | ||
|
||
timexList.append({ | ||
"text": timex_ori, | ||
"value": timex_val | ||
}) | ||
|
||
return tagged_text | ||
return tagged_text, timexList | ||
|
||
#### | ||
|
||
def demo(): | ||
import nltk | ||
text = nltk.corpus.abc.raw('rural.txt')[:10000] | ||
print tag(text) | ||
print(tag(text)) | ||
|
||
if __name__ == '__main__': | ||
demo() | ||
demo() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The new timedelta features of python3 allow us to remove the dependency on mx.DateTime