Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Configurable summarizers #14

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions interval_summarizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ def report_summary(self, messages):
"""The interval summaries are joined."""
return '\n'.join(self.summarize(messages))

def set_interval(self, ispecs):
    """Replace the interval specifications used by this summarizer.

    Args:
        ispecs: iterable of mappings; each one is expanded as keyword
            arguments to ``IntervalSpec``.

    The resulting ``IntervalSpec`` objects are stored on ``self.intervals``.
    """
    # Build a concrete list rather than using map(): on Python 3 map()
    # returns a one-shot iterator, which would leave self.intervals empty
    # after it has been traversed once.
    self.intervals = [IntervalSpec(**ispec) for ispec in ispecs]

def summarize_segment(self, msg_seg):
"""Call the summarizer that is used."""
return msg_seg
Expand Down
13 changes: 7 additions & 6 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,17 @@
from config import *
from ts_config import SUMMS
from slack_summary import SlackRouter
import lsa
import spacy.en
import spacy
app = Flask(__name__)
global lsa_summ
from utils import maybe_get
global lsa_summ
lsa_summ = None
if "spacy" in SUMMS:
import lsa
import spacy.en
import spacy
lsa_summ = lsa.LsaSummarizer()


@app.route("/slack", methods=['POST'])
def slackReq():
global lsa_summ
Expand All @@ -30,7 +31,7 @@ def slackReq():
'params' : maybe_get(req_data, 'text', default=''),
'summ' : lsa_summ
}
if "gensim" in req['params'].split():
if "gensim" in SUMMS and "gensim" in req['params'].split():
req['summ'] = None
return (SlackRouter().get_summary(**req))

Expand All @@ -51,7 +52,7 @@ def slackTestReq():
'summ' : lsa_summ,
'test' : True
}
if "gensim" in req['params'].split():
if "gensim" in SUMMS and "gensim" in req['params'].split():
req['summ'] = None
return (SlackRouter(test=True).get_summary(**req))

Expand Down
2 changes: 1 addition & 1 deletion slack_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def get_summary(self, **args):
self.logger.info(u'Using spacy')
summ_impl = SpacyTsSummarizer(self.build_interval(params))
summ_impl.set_summarizer(summ_object)
elif "gensim" in SUMMS:
else:
self.logger.info(u'Using gensim')
summ_impl = TextRankTsSummarizer(self.build_interval(params))
summary = summ_impl.report_summary(msgs)
Expand Down
56 changes: 22 additions & 34 deletions test_spacy_with_hypothesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import json
import io
from sp_summarizer import (SpacyTsSummarizer)
import hypothesis.settings as hs
from interval_summarizer import (IntervalSpec, TsSummarizer, tagged_sum,
ts_to_time)
import lsa
Expand All @@ -28,27 +29,23 @@ def read_dir(fdir):

test_json_msgs_c3 = [read_dir(fdir) for fdir in ['api-test', 'calypso', 'games', 'happiness', 'hg', 'jetpack', 'jetpackfuel', 'livechat', 'tickets', 'vip']]

# for dirs in ['api-test', 'calypso', 'games', 'happiness', 'hg', 'jetpack', 'jetpackfuel', 'livechat', 'tickets', 'vip']:
# for jfile in glob.glob('./data/slack-logs-2/{}/*.json'.format(dirs)):
# test_json_msgs_c3 += json.load(io.open(jfile, encoding='utf-8'))

print len(test_json_msgs_c3)

class TestSummarize(unittest.TestCase):

test_msgs = test_json_msgs
summ = SpacyTsSummarizer([])
summ.set_summarizer(lsa.LsaSummarizer())


@given(
lists(elements=sampled_from(test_json_msgs), min_size=3),
integers(min_value=1, max_value=20)
integers(min_value=1, max_value=20), settings=hs.Settings(timeout=1000)
)
def test_text_rank_summarization_ds1_days(self, smp_msgs, days):
"""Generate something for N day interval"""
logger.info("Input is %s", smp_msgs)
asd = [{'days': days, 'size' : 3, 'txt' : u'Summary for first {} days:\n'.format(days)}]
summ = SpacyTsSummarizer(asd)
summ.set_summarizer(lsa.LsaSummarizer())
sumry = summ.summarize(smp_msgs)
TestSummarize.summ.set_interval(asd)
sumry = TestSummarize.summ.summarize(smp_msgs)
logger.debug("Summary is %s", sumry)
# Length of summary is at least 1 and no greater than 3
self.assertTrue(len(sumry) >= 1)
Expand All @@ -60,15 +57,14 @@ def test_text_rank_summarization_ds1_days(self, smp_msgs, days):

@given(
lists(elements=sampled_from(test_json_msgs_c2), min_size=12),
integers(min_value=1, max_value=20)
integers(min_value=1, max_value=20), settings=hs.Settings(timeout=1000)
)
def test_text_rank_summarization_ds2_days(self, smp_msgs, days):
"""Generate something for N day interval"""
logger.info("Input is %s", smp_msgs)
asd = [{'days': days, 'size' : 3, 'txt' : u'Summary for first {} days:\n'.format(days)}]
summ = SpacyTsSummarizer(asd)
summ.set_summarizer(lsa.LsaSummarizer())
sumry = summ.summarize(smp_msgs)
TestSummarize.summ.set_interval(asd)
sumry = TestSummarize.summ.summarize(smp_msgs)
logger.debug("Summary is %s", sumry)
# Length of summary is at least 1 and no greater than 3
self.assertTrue(len(sumry) >= 1)
Expand All @@ -80,18 +76,16 @@ def test_text_rank_summarization_ds2_days(self, smp_msgs, days):

@given(
integers(min_value=1, max_value=1000),
integers(min_value=1, max_value=20)
integers(min_value=1, max_value=20), settings=hs.Settings(timeout=1000)
)
def test_text_rank_summarization_ds3_days(self, sampsize, days):
"""Generate something for N day interval"""
ssamp = random.choice(test_json_msgs_c3)
samp = random.choice(test_json_msgs_c3)[random.randint(1,len(ssamp)-2):]
logger.info("Input is segment is %s", samp)
asd = [{'days': days, 'size' : 3, 'txt' : u'Summary for first {} days:\n'.format(days)}]
summ = SpacyTsSummarizer(asd)
summ.set_summarizer(lsa.LsaSummarizer())

sumry = summ.summarize(samp)
TestSummarize.summ.set_interval(asd)
sumry = TestSummarize.summ.summarize(samp)
logger.debug("Summary is %s", sumry)
# Length of summary is at least 1 and no greater than 3
self.assertTrue(len(sumry) >= 1)
Expand All @@ -102,16 +96,14 @@ def test_text_rank_summarization_ds3_days(self, sampsize, days):


@given(lists(elements=sampled_from(test_json_msgs), min_size=1),
integers(min_value=1, max_value=24)
integers(min_value=1, max_value=24), settings=hs.Settings(timeout=1000)
)
def test_text_rank_summarization_ds1_hours(self, smp_msgs, hours):
"""Generate something for N hour intervals"""
logger.info("Input is %s", smp_msgs)
asd = [{'hours': hours, 'size' : 3, 'txt' : u'Summary for first {} hours:\n'.format(hours)}]
summ = SpacyTsSummarizer(asd)
summ.set_summarizer(lsa.LsaSummarizer())

sumry = summ.summarize(smp_msgs)
TestSummarize.summ.set_interval(asd)
sumry = TestSummarize.summ.summarize(smp_msgs)
logger.debug("Summary is %s", sumry)
# Length of summary is at least 1 and no greater than 3
self.assertTrue(len(sumry) >= 1)
Expand All @@ -122,16 +114,14 @@ def test_text_rank_summarization_ds1_hours(self, smp_msgs, hours):


@given(lists(elements=sampled_from(test_json_msgs_c2), min_size=1),
integers(min_value=1, max_value=24)
integers(min_value=1, max_value=24), settings=hs.Settings(timeout=1000)
)
def test_text_rank_summarization_ds2_hours(self, smp_msgs, hours):
"""Generate something for N hour intervals"""
logger.info("Input is %s", smp_msgs)
asd = [{'hours': hours, 'size' : 3, 'txt' : u'Summary for first {} hours:\n'.format(hours)}]
summ = SpacyTsSummarizer(asd)
summ.set_summarizer(lsa.LsaSummarizer())

sumry = summ.summarize(smp_msgs)
TestSummarize.summ.set_interval(asd)
sumry = TestSummarize.summ.summarize(smp_msgs)
logger.debug("Summary is %s", sumry)
# Length of summary is at least 1 and no greater than 3
self.assertTrue(len(sumry) >= 1)
Expand All @@ -143,18 +133,16 @@ def test_text_rank_summarization_ds2_hours(self, smp_msgs, hours):

@given(
integers(min_value=2, max_value=1000),
integers(min_value=1, max_value=24)
integers(min_value=1, max_value=24), settings=hs.Settings(timeout=1000)
)
def test_text_rank_summarization_ds3_hours(self, sampsize, hours):
"""Generate something for N hour intervals"""
ssamp = random.choice(test_json_msgs_c3)
samp = random.choice(test_json_msgs_c3)[random.randint(1,len(ssamp)-2):]
logger.info("Input is segment is %s", samp)
asd = [{'hours': hours, 'size' : 3, 'txt' : u'Summary for first {} hours:\n'.format(hours)}]
summ = SpacyTsSummarizer(asd)
summ.set_summarizer(lsa.LsaSummarizer())

sumry = summ.summarize(samp)
TestSummarize.summ.set_interval(asd)
sumry = TestSummarize.summ.summarize(samp)
logger.debug("Summary is %s", sumry)
# Length of summary is at least 1 and no greater than 3
self.assertTrue(len(sumry) >= 1)
Expand Down
2 changes: 1 addition & 1 deletion ts_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@
DEBUG=True
LOG_FILE="./summary.log"
TEST_JSON="./data/test-events-elastic.json"
SUMMS=["gensim", "spacy"]
SUMMS=["spacy"]