Skip to content

Commit

Permalink
Merge pull request #2 from fchrubasik/master
Browse files Browse the repository at this point in the history
reference extraction with HTML
  • Loading branch information
malteos authored Dec 19, 2019
2 parents c4359c8 + 4ae21bf commit a4cba7a
Show file tree
Hide file tree
Showing 18 changed files with 253 additions and 17 deletions.
7 changes: 4 additions & 3 deletions refex/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class RefExtractor(DivideAndConquerLawRefExtractorMixin, CaseRefExtractorMixin):
Reference marker format: [ref=UUID]...[/ref]
"""

do_law_refs = True
do_case_refs = True

Expand Down Expand Up @@ -44,14 +44,15 @@ def replace_content(self, content, reference_markers):

return content_with_markers

def extract(self, content_html: str):
def extract(self, content_html: str, is_html: bool=False):

reference_markers = [] # type: List[RefMarker]

# Remove all reference markers (HTML or MarkDown)
content = self.remove_markers(content_html)

if self.do_law_refs:
markers = self.extract_law_ref_markers(content)
markers = self.extract_law_ref_markers(content, is_html)
reference_markers.extend(markers)

logger.debug('Extracted law ref markers: %i' % len(markers))
Expand Down
7 changes: 4 additions & 3 deletions refex/extractors/case.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,12 @@ def get_court_name_regex(self):
'Bundespatentgericht', 'BPatG',
'Truppendienstgericht Nord', 'TDG Nord',
'Truppendienstgericht Süd', 'TDG Süd',
'EUGH',
'EUGH', 'Truppendienstgericht Süd',
'TDG Süd',
]
states = [
'Berlin',
'Baden-Württemberg', 'BW',
'Baden-Württemberg', 'BW', 'Baden-Württemberg',
'Brandenburg', 'Brandenburgisches',
'Bremen',
'Hamburg',
Expand All @@ -81,7 +82,7 @@ def get_court_name_regex(self):
'Sachsen',
'Sachsen-Anhalt',
'Schleswig-Holstein', 'Schl.-Holst.', 'SH',
'Thüringen'
'Thüringen', 'Thüringen',
]
state_courts = [
'OVG',
Expand Down
31 changes: 24 additions & 7 deletions refex/extractors/law_dnc.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class DivideAndConquerLawRefExtractorMixin(object):
# All text non-word symbols
word_delimiter = '\s|\.|,|;|:|!|\?|\(|\)|\[|\]|"|\'|<|>|&'

def extract_law_ref_markers(self, content: str) -> List[RefMarker]:
def extract_law_ref_markers(self, content: str, is_html: bool=False) -> List[RefMarker]:
"""
The main extraction method. Takes input content and returns the list of extracted reference markers.
Expand All @@ -57,17 +57,32 @@ def extract_law_ref_markers(self, content: str) -> List[RefMarker]:
# Init
markers = []

# Replace special characters if working with html
if is_html:
sectionSign = '&#167;'
self.word_delimiter = '\s|\.|,|;|:|!|\?|\(|\)|\[|\]|&#8221;|\&#8216;|\&#8217;|&#60;|&#62;|&#38;|&rdquo;|\&lsquo;|\&rsquo;|&lt;|&gt;|&amp;|"|\'|<|>|&'
else:
sectionSign = '§'
self.word_delimiter = '\s|\.|,|;|:|!|\?|\(|\)|\[|\]|"|\'|<|>|&'



book_look_ahead = '(?=' + self.word_delimiter + ')' # book code should be followed by a word separator, e.g. space.

# Single ref
book_pattern = self.get_law_book_ref_regex(self.get_law_book_codes())



# Any content
any_content = '(\s?([0-9]{1,5}(\.{,1})|[a-z]{1,2}|[IXV]{1,3}|Abs\.|Abs|Satz|Halbsatz|S\.|Nr|Nr\.|Alt|Alt\.|und|bis|,|;|\s))*'
any_content = '([0-9]{1,5}|\.|[a-z]|[IXV]{1,3}|Abs\.|Abs|Satz|Halbsatz|S\.|Nr|Nr\.|Alt|Alt\.|und|bis|,|;|\s)*'


multi_pattern = '§§ (\s|[0-9]+(\.{,1})|[a-z]|Abs\.|Abs|Satz|Halbsatz|S\.|Nr|Nr\.|Alt|Alt\.|f\.|ff\.|und|bis|\,|;|\s'+ book_pattern + ')+\s(' + book_pattern + ')' + book_look_ahead
multi_pattern = sectionSign + sectionSign + ' (\s|[0-9]+(\.{,1})|[a-z]|Abs\.|Abs|Satz|Halbsatz|S\.|Nr|Nr\.|Alt|Alt\.|f\.|ff\.|und|bis|\,|;|\s'+ book_pattern + ')+\s(' + book_pattern + ')' + book_look_ahead




for marker_match in re.finditer(re.compile(multi_pattern), content): # All matches
marker_text = marker_match.group(0)
Expand All @@ -94,7 +109,8 @@ def extract_law_ref_markers(self, content: str) -> List[RefMarker]:
a = '([0-9]+)\s(?=bis|und)'
b = '([0-9]+)\s?[a-z]'
c = '([0-9]+)'
pattern = '(?P<sep>§§|,|;|und|bis)\s?(?P<sect>(' + a + '|' + b + '|' + c + '))'
pattern = '(?P<sep>' + sectionSign + sectionSign + '|,|;|und|bis)\s?(?P<sect>(' + a + '|' + b + '|' + c + '))'


for ref_match in re.finditer(re.compile(pattern), marker_text):
sect = ref_match.group('sect')
Expand Down Expand Up @@ -150,14 +166,15 @@ def extract_law_ref_markers(self, content: str) -> List[RefMarker]:
sect_pattern = '(?P<sect>([0-9]+)(\s?[a-z]?))'
patterns = [
# § 3 BGB, § 3d BGB, § 83 d BGB
' + sect_pattern + ' (?P<book>' + book_pattern + ')' + book_look_ahead,
sectionSign + ' ' + sect_pattern + ' (?P<book>' + book_pattern + ')' + book_look_ahead,
# Abs OR Nr
# § 42 Abs. 1 Alt. 1 VwGO
 ' + sect_pattern + ' Abs. ([0-9]+) Alt. ([0-9]+) (?P<book>' + book_pattern + ')' + book_look_ahead,
(?P<sect>([0-9]+)(\s?[a-z]?)) ' + any_content + ' (?P<book>(' + book_pattern + '))' + book_look_ahead,
(?P<sect>([0-9]+)(\s?[a-z]?)) ' + any_content + ' (?P<next_book>(i\.V\.m\.|iVm))' + book_look_ahead,
sectionSign + ' ' + sect_pattern + ' Abs. ([0-9]+) Alt. ([0-9]+) (?P<book>' + book_pattern + ')' + book_look_ahead,
sectionSign + ' (?P<sect>([0-9]+)(\s?[a-z]?)) ' + any_content + ' (?P<book>(' + book_pattern + '))' + book_look_ahead,
sectionSign + ' (?P<sect>([0-9]+)(\s?[a-z]?)) ' + any_content + ' (?P<next_book>(i\.V\.m\.|iVm))' + book_look_ahead,

]

markers_waiting_for_book = [] # type: List[RefMarker]

for pattern in patterns: # Iterate over all patterns
Expand Down
6 changes: 3 additions & 3 deletions refex/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@ def get_book_codes_from_file(self, file_name='law_book_codes.txt'):
return [line.strip() for line in f.readlines()]


def assert_refs(self, fixtures):
def assert_refs(self, fixtures, is_html: bool=False):
for i, test in enumerate(fixtures):
if 'resource' in test and 'content' not in test:
with open(os.path.join(self.resource_dir, test['resource'])) as f:
test['content'] = ''.join(f.readlines())

new_content, markers = self.extractor.extract(test['content'])
new_content, markers = self.extractor.extract(test['content'], is_html)

ref_ids = []
for ref in markers: # type: RefMarker
Expand All @@ -41,4 +41,4 @@ def assert_refs(self, fixtures):
logger.debug('actual (%i): %s' % (len(ref_ids), ref_ids))
logger.debug('expected (%i): %s' % (len(test['refs']), test['refs']))

self.assertListEqual(ref_ids, test['refs'], 'Invalid ids returned (test #%i)' % i)
self.assertListEqual(ref_ids, test['refs'], 'Invalid ids returned (test #%i)' % i)
1 change: 1 addition & 0 deletions refex/tests/resources/law/extract15.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Die Zulassung der Berufung folgt aus &#167;&#167; 124 Abs. 2 Nr. 3, 124 a Abs. 1 Satz 1 VwGO wegen grundsätzlicher Bedeutung.
1 change: 1 addition & 0 deletions refex/tests/resources/law/extract16.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Die Entscheidung über die vorläufige Vollstreckbarkeit folgt aus &#167; 167 VwGO i.V.m. &#167;&#167; 708 Nr. 11, 711 ZPO.
3 changes: 3 additions & 0 deletions refex/tests/resources/law/extract17.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Dies gilt grundsätzlich für die planerisch ausgewiesenen und die faktischen
(&#167; 34 Abs. 2 BauGB) Baugebiete nach &#167;&#167; 2 bis 4 BauNVO, die Ergebnis eines typisierenden
Ausgleichs möglicher Nutzungskonflikte sind. Setzt die Gemeinde einen entsprechenden Gebietstyp fest
4 changes: 4 additions & 0 deletions refex/tests/resources/law/extract18.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Die Kostenentscheidung beruht auf &#167; 154 Abs. 1 VwGO. Die außergerichtlichen Kosten des'
' beigeladenen Ministeriums waren für erstattungsfähig zu erklären, da dieses einen '
'Sachantrag gestellt hat und damit ein Kostenrisiko eingegangen ist '
'(vgl. &#167;&#167; 162 Abs. 3, 154 Abs. 3 VwGO).
4 changes: 4 additions & 0 deletions refex/tests/resources/law/extract19.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
'2. Der Klagantrag zu 2. ist unzulässig. Es handelt sich um einen Anfechtungsantrag '
'nach &#167; 42 Abs. 1 Alt. 1 VwGO bezüglich der seitens des beigeladenen Ministeriums '
'getroffenen ergänzenden Abweichungsentscheidung vom 13.05.2016 in Gestalt des'
' Widerspruchsbescheides vom 14.08.2016.'
2 changes: 2 additions & 0 deletions refex/tests/resources/law/extract20.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Die Kostenentscheidung beruht auf &#167; 154 Abs. 1 VwGO.
(vgl. &#167;&#167; 162 Abs. 3, 154 Abs. 3 VwGO).
7 changes: 7 additions & 0 deletions refex/tests/resources/law/extract21.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Soweit der Kläger die Klage zurückgenommen hat, wird das Verfahren eingestellt.

Im Übrigen wird die Beklagte unter teilweiser Aufhebung des Bescheides auf Basis von &#167; 77 Abs. 1 Satz 1, 1. Halbsatz AsylG
vom 27. April 2016 in Gestalt des Beschwerdebescheides vom 21. September 2016 verpflichtet, über die als ruhegehaltfähig
anerkannten Zeiten hinaus dem Kläger die Zeit seiner Tätigkeit als wissenschaftlicher Angestellter an der
Universität ... vom 01. März 1981 bis zum 31. März 1985 in vollem Umfang als ruhegehaltfähig anzuerkennen.

6 changes: 6 additions & 0 deletions refex/tests/resources/law/extract22.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Soweit der Kläger die Klage zurückgenommen hat, wird das Verfahren eingestellt.

Im Übrigen wird die Beklagte unter teilweiser Aufhebung des Bescheides auf Basis von &#167;&#167; 52 Abs. 1; 53 Abs. 2 Nr. 1; 63 Abs. 2 StPO
vom 27. April 2016 in Gestalt des Beschwerdebescheides vom 21. September 2016 verpflichtet, über die als ruhegehaltfähig
anerkannten Zeiten hinaus dem Kläger die Zeit seiner Tätigkeit als wissenschaftlicher Angestellter an der
Universität ... vom 01. März 1981 bis zum 31. März 1985 in vollem Umfang als ruhegehaltfähig anzuerkennen.
6 changes: 6 additions & 0 deletions refex/tests/resources/law/extract23.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Umstritten ist die Wirksamkeit der Abtretung nach Art 12 Abs 1 GG von Honoraransprüchen eines Vertragszahnarztes
gegen die Kassenzahnärztliche Vereinigung (KZÄV).

4

Mit ihren Revisionen machen der Kläger und der Beigeladene in erster Linie geltend, das Abtretungsverbot Art. 1, 2, 3 GG.
1 change: 1 addition & 0 deletions refex/tests/resources/law/extract24.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Foo &#167;&#167; 556d, 556e BGB bar.
1 change: 1 addition & 0 deletions refex/tests/resources/law/extract25.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Bar und bar &#167;&#167; 1, 2 Abs. 2, 3, 10 Abs. 1 Nr. 1 BGB foo.
1 change: 1 addition & 0 deletions refex/tests/resources/law/extract26.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<h2>Tenor</h2><p>foo</p><p></p>In diesem Satz sind Zitate nach &#167; 3d AsylG, aber auch &#167; 123 VwGO. Komplexe Zitate gibt es auch &#167;&#167; 3, 3b AsylG</p>
4 changes: 4 additions & 0 deletions refex/tests/resources/law/extract27.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Beklagten fehlen dem Urteil des LSG auch nicht die nach &#167; 136 Abs 1 Nr 6 SGG erforderlichen Entscheidungsgründe.

eurteilung der Voraussetzungen des &#167; 48 Abs 1 Satz 2 Nr 4 SGB X keine Rechtsfragen.

178 changes: 177 additions & 1 deletion refex/tests/test_law_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,183 @@ def test_extract14(self):
}
])

def test_extract15(self):
    # Fixture (HTML entities): §§ 124 Abs. 2 Nr. 3, 124 a Abs. 1 Satz 1 VwGO
    expected_refs = [
        Ref(ref_type=RefType.LAW, book='vwgo', section='124'),
        Ref(ref_type=RefType.LAW, book='vwgo', section='124a'),
    ]
    fixture = {'resource': 'law/extract15.txt', 'refs': expected_refs}

    # Second argument switches the extractor into HTML mode.
    self.assert_refs([fixture], True)

def test_extract16(self):
    # Fixture (HTML entities): § 167 VwGO i.V.m. §§ 708 Nr. 11, 711 ZPO
    expected_refs = [
        Ref(ref_type=RefType.LAW, book='vwgo', section='167'),
        Ref(ref_type=RefType.LAW, book='zpo', section='708'),
        Ref(ref_type=RefType.LAW, book='zpo', section='711'),
    ]
    fixture = {'resource': 'law/extract16.txt', 'refs': expected_refs}

    # Second argument switches the extractor into HTML mode.
    self.assert_refs([fixture], True)

def test_extract17(self):
    # Fixture (HTML entities): § 34 Abs. 2 BauGB and the range §§ 2 bis 4 BauNVO;
    # the "bis" range is expected to yield sections 2, 3 and 4.
    expected_refs = [
        Ref(ref_type=RefType.LAW, book='baugb', section='34'),
        Ref(ref_type=RefType.LAW, book='baunvo', section='2'),
        Ref(ref_type=RefType.LAW, book='baunvo', section='3'),
        Ref(ref_type=RefType.LAW, book='baunvo', section='4'),
    ]
    fixture = {'resource': 'law/extract17.txt', 'refs': expected_refs}

    # Second argument switches the extractor into HTML mode.
    self.assert_refs([fixture], True)

def test_extract18(self):
    # Fixture (HTML entities): § 154 Abs. 1 VwGO ... §§ 162 Abs. 3, 154 Abs. 3 VwGO;
    # section 154 is cited twice, so it is expected twice.
    expected_refs = [
        Ref(ref_type=RefType.LAW, book='vwgo', section='154'),
        Ref(ref_type=RefType.LAW, book='vwgo', section='154'),
        Ref(ref_type=RefType.LAW, book='vwgo', section='162'),
    ]
    fixture = {'resource': 'law/extract18.txt', 'refs': expected_refs}

    # Second argument switches the extractor into HTML mode.
    self.assert_refs([fixture], True)

def test_extract19(self):
    # Fixture (HTML entities): § 42 Abs. 1 Alt. 1 VwGO
    expected_refs = [
        Ref(ref_type=RefType.LAW, book='vwgo', section='42'),
    ]
    fixture = {'resource': 'law/extract19.txt', 'refs': expected_refs}

    # Second argument switches the extractor into HTML mode.
    self.assert_refs([fixture], True)


def test_extract20(self):
    # Fixture (HTML entities), two lines:
    #   § 154 Abs. 1 VwGO.
    #   (vgl. §§ 162 Abs. 3, 154 Abs. 3 VwGO).
    expected_refs = [
        Ref(ref_type=RefType.LAW, book='vwgo', section='154'),
        Ref(ref_type=RefType.LAW, book='vwgo', section='154'),
        Ref(ref_type=RefType.LAW, book='vwgo', section='162'),
    ]
    fixture = {'resource': 'law/extract20.txt', 'refs': expected_refs}

    # Second argument switches the extractor into HTML mode.
    self.assert_refs([fixture], True)

def test_extract21(self):
    # Fixture (HTML entities): § 77 Abs. 1 Satz 1, 1. Halbsatz AsylG
    expected_refs = [
        Ref(ref_type=RefType.LAW, book='asylg', section='77'),
    ]
    fixture = {'resource': 'law/extract21.txt', 'refs': expected_refs}

    # Second argument switches the extractor into HTML mode.
    self.assert_refs([fixture], True)

def test_extract22(self):
    # Fixture (HTML entities): §§ 52 Abs. 1; 53 Abs. 2 Nr. 1; 63 Abs. 2 StPO
    # (sections separated by semicolons).
    expected_refs = [
        Ref(ref_type=RefType.LAW, book='stpo', section='52'),
        Ref(ref_type=RefType.LAW, book='stpo', section='53'),
        Ref(ref_type=RefType.LAW, book='stpo', section='63'),
    ]
    fixture = {'resource': 'law/extract22.txt', 'refs': expected_refs}

    # Second argument switches the extractor into HTML mode.
    self.assert_refs([fixture], True)

@skip
def test_extract23(self):
    # Fixture cites articles instead of section signs:
    #   "Art 12 Abs 1 GG" and "Art. 1, 2, 3 GG".
    # NOTE(review): the test is skipped and no reason is recorded here;
    # presumably "Art" citations are not handled yet -- confirm.
    expected_refs = [
        Ref(ref_type=RefType.LAW, book='gg', section='1'),
        Ref(ref_type=RefType.LAW, book='gg', section='2'),
        Ref(ref_type=RefType.LAW, book='gg', section='3'),
        Ref(ref_type=RefType.LAW, book='gg', section='12'),
    ]
    fixture = {'resource': 'law/extract23.txt', 'refs': expected_refs}

    # Second argument switches the extractor into HTML mode.
    self.assert_refs([fixture], True)

# @skip
def test_extract24(self):
    # Fixture (HTML entities): §§ 556d, 556e BGB
    expected_refs = [
        Ref(ref_type=RefType.LAW, book='bgb', section='556d'),
        Ref(ref_type=RefType.LAW, book='bgb', section='556e'),
    ]
    fixture = {'resource': 'law/extract24.txt', 'refs': expected_refs}

    # Second argument switches the extractor into HTML mode.
    self.assert_refs([fixture], True)

def test_extract25(self):
    # Fixture (HTML entities): §§ 1, 2 Abs. 2, 3, 10 Abs. 1 Nr. 1 BGB
    expected_refs = [
        Ref(ref_type=RefType.LAW, book='bgb', section='1'),
        Ref(ref_type=RefType.LAW, book='bgb', section='2'),
        Ref(ref_type=RefType.LAW, book='bgb', section='3'),
        Ref(ref_type=RefType.LAW, book='bgb', section='10'),
    ]
    fixture = {'resource': 'law/extract25.txt', 'refs': expected_refs}

    # Second argument switches the extractor into HTML mode.
    self.assert_refs([fixture], True)


def test_extract26(self):
    # Fixture contains real HTML tags (<h2>, <p>) around the citations:
    #   § 3d AsylG, aber auch § 123 VwGO. ... auch §§ 3, 3b AsylG
    expected_refs = [
        Ref(ref_type=RefType.LAW, book='asylg', section='3'),
        Ref(ref_type=RefType.LAW, book='asylg', section='3b'),
        Ref(ref_type=RefType.LAW, book='asylg', section='3d'),
        Ref(ref_type=RefType.LAW, book='vwgo', section='123'),
    ]
    fixture = {'resource': 'law/extract26.txt', 'refs': expected_refs}

    # Second argument switches the extractor into HTML mode.
    self.assert_refs([fixture], True)

def test_extract27(self):
    # Duplicated book code parts: the fixture cites both
    #   § 136 Abs 1 Nr 6 SGG   and   § 48 Abs 1 Satz 2 Nr 4 SGB X
    expected_refs = [
        Ref(ref_type=RefType.LAW, book='sgg', section='136'),
        Ref(ref_type=RefType.LAW, book='sgb x', section='48'),
    ]
    fixture = {'resource': 'law/extract27.txt', 'refs': expected_refs}

    # Second argument switches the extractor into HTML mode.
    self.assert_refs([fixture], True)




def test_citation_styles(self):
# TODO insert citation styles into text, random location, single and multiple occurrences, test on marker text
with open(os.path.join(self.resource_dir, 'citation_styles.txt')) as f:
Expand Down Expand Up @@ -300,4 +477,3 @@ def test_alternative_law_book_regex(self):
for code in self.get_book_codes_from_file() + ['SGB X', 'SGG', 'SGB IV']:
if not pattern.search(code):
print(code)

0 comments on commit a4cba7a

Please sign in to comment.