Skip to content

Commit

Permalink
Merge pull request #2051 from teovin/court-listener-api
Browse files Browse the repository at this point in the history
add first draft of CL search
  • Loading branch information
teovin authored Jun 26, 2024
2 parents 9749150 + e05079e commit 3d7b319
Show file tree
Hide file tree
Showing 5 changed files with 295 additions and 35 deletions.
151 changes: 151 additions & 0 deletions web/main/case_xml_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
"""
Convert between XML and HTML versions of CAP's formatted case data.
"""

import lxml.sax
import lxml.html
import xml.sax

from lxml import etree

# sax functions passed to render_sax_tags
# These are the handler methods looked up on the class rather than on an
# instance, so render_sax_tags can call them later as method(handler, *args)
# against a handler it creates itself.
sax_start = lxml.sax.ElementTreeContentHandler.startElement
sax_end = lxml.sax.ElementTreeContentHandler.endElement
sax_chars = lxml.sax.ElementTreeContentHandler.characters

# Source XML element name -> HTML tag name it is rendered as.
# XmlToHtmlHandler.endElement looks names up here to decide which tag to
# close; any element that is not listed is closed under its own name.
mapping = {
"casebody": "section",
"parties": "h4",
"docketnumber": "p",
"court": "p",
"decisiondate": "p",
"otherdate": "p",
"attorneys": "p",
"opinion": "article",
"author": "p",
"page-number": "a",
"extracted-citation": "a",
"bracketnum": "a",
"footnotemark": "a",
}


def render_sax_tags(tag_stack):
    """Replay recorded SAX calls to build an lxml tree.

    ``tag_stack`` is an iterable of ``(method, args)`` pairs, where ``method``
    is one of the module-level sax functions (``sax_start``, ``sax_end``,
    ``sax_chars``) and ``args`` is the tuple of arguments to pass after the
    handler itself.

    Returns the handler's ``_root`` attribute once every call has been
    replayed (presumably the root element of the built tree; ``_root`` is a
    private lxml attribute).
    """
    builder = lxml.sax.ElementTreeContentHandler()
    for sax_call, call_args in tag_stack:
        sax_call(builder, *call_args)
    return builder._root


class XmlToHtmlHandler(xml.sax.ContentHandler):
    """SAX handler that records the HTML counterpart of a case XML document.

    Nothing is rendered while parsing. Every callback appends a
    ``(sax_function, args)`` pair to ``tag_stack``; replaying those pairs
    against an lxml content handler (see ``render_sax_tags``) builds the tree.

    Everything between the opening ``<casebody>`` and the first ``<opinion>``
    (or the closing ``</casebody>`` if no opinion appears) is wrapped in
    ``<section class="head-matter">``.
    """

    # Elements rendered as block-level content: <h4>, <p> or <blockquote>.
    _CONTENT_ELEMENTS = (
        "parties",
        "docketnumber",
        "court",
        "decisiondate",
        "otherdate",
        "attorneys",
        "author",
        "p",
        "blockquote",
    )

    def __init__(self, case_id):
        """Start with an empty call list; ``case_id`` is emitted as ``data-case-id``."""
        super().__init__()
        self.tag_stack = []
        self.case_id = case_id
        # True while we are inside the synthetic head-matter <section>.
        self.head_matter_open = False

    def startElement(self, name, attrs):
        """Queue the HTML start tag that corresponds to XML element ``name``.

        Raises:
            KeyError: if a required source attribute is missing
                (``firstpage``/``lastpage`` on casebody, ``label``/
                ``citation-index`` on page-number, ``url``/``index`` on
                extracted-citation).
        """
        if name == "casebody":
            self.tag_stack.append(
                (
                    sax_start,
                    (
                        "section",
                        {
                            "class": "casebody",
                            "data-case-id": self.case_id,
                            "data-firstpage": attrs["firstpage"],
                            "data-lastpage": attrs["lastpage"],
                        },
                    ),
                )
            )
            # Open the head-matter wrapper right after the casebody section.
            self.tag_stack.append((sax_chars, ("\n ",)))
            self.tag_stack.append((sax_start, ("section", {"class": "head-matter"})))
            self.head_matter_open = True
        elif name == "opinion":
            # The first opinion marks the end of the head matter.
            if self.head_matter_open:
                self.close_head_matter()
            # set opinion type to 'none' for opinions that don't have 'type' in source xml
            attr_type = attrs.get("type", "none")
            self.tag_stack.append(
                (sax_start, ("article", {"class": "opinion", "data-type": attr_type}))
            )
        elif name == "page-number":
            label = attrs["label"]
            self.tag_stack.append(
                (
                    sax_start,
                    (
                        "a",
                        {
                            "id": "p" + label,
                            "href": f"#p{label}",
                            "data-label": label,
                            "data-citation-index": attrs["citation-index"],
                            "class": "page-label",
                        },
                    ),
                )
            )
        elif name == "extracted-citation":
            new_attrs = {"href": attrs["url"], "class": "citation", "data-index": attrs["index"]}
            if "case-ids" in attrs:
                new_attrs["data-case-ids"] = attrs["case-ids"]
            self.tag_stack.append((sax_start, ("a", new_attrs)))
        elif name in ("footnotemark", "bracketnum"):
            new_attrs = {"class": name}
            if "href" in attrs:
                new_attrs["href"] = attrs["href"]
            if "id" in attrs:
                new_attrs["id"] = attrs["id"]
            self.tag_stack.append((sax_start, ("a", new_attrs)))
        elif name in self._CONTENT_ELEMENTS:
            # content element
            # set id to 'none' for elements that don't have 'id' in source xml
            # Build the output attributes in a separate dict: rebinding
            # ``attrs`` before the "data-blocks" check would hide the source
            # attributes and the value would never be copied across.
            new_attrs = {"id": attrs.get("id", "none")}
            if "data-blocks" in attrs:
                new_attrs["data-blocks"] = attrs["data-blocks"]
            if name not in ("p", "blockquote"):
                new_attrs["class"] = name
            if name == "parties":
                new_name = "h4"
            elif name == "blockquote":
                new_name = "blockquote"
            else:
                new_name = "p"
            if self.head_matter_open:
                self.tag_stack.append((sax_chars, (" ",)))
            self.tag_stack.append((sax_start, (new_name, new_attrs)))
        else:
            # passthrough
            self.tag_stack.append((sax_start, (name, attrs)))

    def characters(self, text):
        """Queue a run of character data."""
        # NOTE(review): as written, this substitution replaces a single space
        # with a single space, i.e. it is a no-op. The literals may have had
        # their whitespace collapsed somewhere upstream - confirm the intended
        # values against the canonical file.
        if self.head_matter_open and text == " ":
            text = " "
        self.tag_stack.append((sax_chars, (text,)))

    def endElement(self, name):
        """Queue the end tag for ``name``, translated through ``mapping``."""
        # A case without any opinion still has the head-matter section open
        # when the casebody ends, so close it before closing the casebody.
        if name == "casebody" and self.head_matter_open:
            self.close_head_matter()
        self.tag_stack.append((sax_end, (mapping.get(name, name),)))

    def close_head_matter(self):
        """Queue the end of the head-matter section and clear the flag."""
        self.tag_stack.append((sax_end, ("section",)))
        self.tag_stack.append((sax_chars, ("\n ",)))
        self.head_matter_open = False


def xml_to_html(input, case_id):
    """Convert a case XML document to its HTML representation.

    ``input`` is parsed with ``xml.sax`` through an ``XmlToHtmlHandler``
    created for ``case_id``; the recorded calls are then replayed by
    ``render_sax_tags`` and the resulting tree is serialized with lxml's
    ``html`` method.

    Returns the serialized HTML as ``str``.
    """
    converter = XmlToHtmlHandler(case_id)
    xml.sax.parseString(input, converter)
    html_root = render_sax_tags(converter.tag_stack)
    return etree.tostring(html_root, encoding=str, method="html")
160 changes: 129 additions & 31 deletions web/main/legal_document_sources.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,13 @@
from django.conf import settings
from django.contrib.postgres.search import SearchQuery, SearchRank, SearchVector
from pyquery import PyQuery
from main.case_xml_converter import xml_to_html

from main.utils import APICommunicationError, looks_like_case_law_link, looks_like_citation
from main.utils import (
APICommunicationError,
looks_like_case_law_link,
looks_like_citation,
)

vs_check = re.compile(" [vV][sS]?[.]? ")

Expand Down Expand Up @@ -519,8 +524,8 @@ def header_template(legal_document):
class CourtListener:
details = {
"name": "CourtListener",
"short_description": "hello",
"long_description": "CourtListener searches millions of opinions across hundreds of jurisdictions",
"short_description": "CourtListener contains millions of legal opinions.",
"long_description": "CourtListener searches millions of opinions across hundreds of jurisdictions.",
"link": settings.COURTLISTENER_BASE_URL,
"search_regexes": [],
"footnote_regexes": [],
Expand All @@ -532,11 +537,7 @@ def search(search_params):
if not settings.COURTLISTENER_API_KEY:
raise APICommunicationError("A CourtListener API key is required")
try:
params = (
{"citation": search_params.q}
if looks_like_citation(search_params.q)
else {"q": search_params.q}
)
params = CourtListener.get_search_params(search_params)
resp = requests.get(
f"{settings.COURTLISTENER_BASE_URL}/api/rest/v3/search",
params,
Expand All @@ -552,13 +553,16 @@ def search(search_params):
results.append(
{
"fullName": r["caseName"],
"shortName": r["caseName"],
"fullCitations": ", ".join(r["citation"]),
"shortCitations": ", ".join(r["citation"][:3])
+ ("..." if len(r["citation"]) > 3 else ""),
"effectiveDate": parser.isoparse(r["dateFiled"]).strftime("%Y-%m-%d"),
"shortName": truncate_name(r["caseName"]),
"fullCitations": ", ".join(r["citation"]) if r["citation"] else "",
"shortCitations": (
", ".join(r["citation"][:3]) + ("..." if len(r["citation"]) > 3 else "")
if r["citation"]
else ""
),
"effectiveDate": parser.isoparse(r["dateFiled"][:25]).strftime("%Y-%m-%d"),
"url": f"{settings.COURTLISTENER_BASE_URL}{r['absolute_url']}",
"id": r["id"],
"id": r["cluster_id"],
}
)
return results
Expand All @@ -576,38 +580,132 @@ def pull(legal_doc_source, id):
)
resp.raise_for_status()
cluster = resp.json()
resp = requests.get(
f"{settings.COURTLISTENER_BASE_URL}/api/rest/v3/opinions/{id}/",
headers={"Authorization": f"Token {settings.COURTLISTENER_API_KEY}"},
)
resp.raise_for_status()

opinion = resp.json()
cluster["html_info"] = {"source": "court listener"}
cluster["sub_opinions"].sort(key=lambda url: int(url.split("/")[-2]))

sub_opinion_jsons = []
for opinion in cluster["sub_opinions"]:
sub_opinion_jsons.append(CourtListener.get_opinion_body(opinion))

text_source = ""
for content_type in (
"xml_harvard",
"html_with_citations",
"html_columbia",
"html_lawbox",
"html_anon_2020",
"html",
"plain_text",
):
case_text = "".join(sub_opinion[content_type] for sub_opinion in sub_opinion_jsons)
if case_text:
case_text = case_text.replace('<?xml version="1.0" encoding="utf-8"?>', "")
text_source = content_type
break

if not case_text:
msg = f"Case text not found for cluster {id}"
raise Exception(msg)

if text_source == "xml_harvard":
case_text = CourtListener.prepare_case_html(cluster, case_text)

cluster["html_info"]["source_field"] = text_source
additional_metadata = (CourtListener.get_additional_cluster_metadata(id))["results"][0]

except requests.exceptions.HTTPError as e:
msg = f"Failed call to {resp.request.url}: {e}\n{resp.content}"
raise APICommunicationError(msg)

body = opinion["html"]
citations = [
f"{x.get('volume')} {x.get('reporter')} {x.get('page')}" for x in cluster["citations"]
]

# https://www.courtlistener.com/help/api/rest/#case-names
case_name = cluster["case_name"] or cluster["case_name_full"][:10000]
cluster["court"] = {"name": additional_metadata.get("court")}
cluster["docket_number"] = additional_metadata.get("docketNumber")

case = LegalDocument(
source=legal_doc_source,
short_name=cluster["case_name"],
name=cluster["case_name"],
short_name=cluster.get("case_name"),
name=case_name,
doc_class="Case",
citations=cluster["citations"],
jurisdiction="",
effective_date=cluster["date_filed"],
publication_date=cluster["date_filed"],
citations=citations,
jurisdiction=cluster.get("court_id"),
effective_date=parser.parse(cluster.get("date_filed")),
publication_date=parser.parse(cluster.get("date_modified")),
updated_date=datetime.now(),
source_ref=str(id),
content=body,
metadata=None,
content=case_text,
metadata=cluster,
)

return case

@staticmethod
def header_template(legal_document):
return "empty_header.html"
return "court_listener_header.html"

@staticmethod
def get_opinion_body(sub_opinion_url):
    """Fetch one opinion from the CourtListener v3 opinions endpoint.

    The opinion id is read from the second-to-last "/"-separated component of
    ``sub_opinion_url`` and must be an integer.

    Returns the decoded JSON response body. ``raise_for_status`` raises for
    an unsuccessful HTTP response.
    """
    url_parts = sub_opinion_url.split("/")
    opinion_id = int(url_parts[-2])
    endpoint = f"{settings.COURTLISTENER_BASE_URL}/api/rest/v3/opinions/{opinion_id}/"
    auth_headers = {"Authorization": f"Token {settings.COURTLISTENER_API_KEY}"}

    response = requests.get(endpoint, headers=auth_headers)
    response.raise_for_status()
    return response.json()

@staticmethod
def prepare_case_html(cluster, opinions_xml):
    """Wrap a cluster's head matter and opinion XML in a casebody and convert it to HTML.

    Args:
        cluster: cluster dict; ``headmatter`` and ``id`` are read from it.
        opinions_xml: the concatenated opinion XML for the cluster.

    Returns:
        The HTML string produced by ``xml_to_html``.

    Raises:
        Exception: if the conversion fails; the original error is attached
            as the cause.
    """
    # Despite its name this also carries the opening <casebody> tag; the
    # first/last page values are placeholders.
    xml_declaration = (
        "<?xml version='1.0' encoding='utf-8'?>\n<casebody firstpage='0' lastpage='0'>"
    )
    case_xml = f"{xml_declaration}\n{cluster['headmatter']}\n{opinions_xml}</casebody>"
    # 'mismatched br tag' and 'invalid attribute https:' error workarounds
    case_xml = case_xml.replace("<br>", "").replace('https:=""', "")

    try:
        converted_case_html = xml_to_html(case_xml, str(cluster["id"]))
    except Exception as e:
        msg = f"Error converting xml to html for case {cluster['id']}: {e}"
        # Chain explicitly so the parser error stays attached as __cause__.
        raise Exception(msg) from e

    return converted_case_html

@staticmethod
def get_search_params(search_params):
    """Build the query-string parameters for a CourtListener search request.

    The query text is sent under ``citation`` when ``looks_like_citation``
    accepts it and under ``q`` otherwise. The date and court filters are read
    from ``after_date``, ``before_date`` and ``jurisdiction``.

    Returns a dict with every ``None``-valued entry removed.
    """
    query_text = search_params.q
    query_key = "citation" if looks_like_citation(query_text) else "q"
    candidates = {
        query_key: query_text,
        "filed_after": search_params.after_date,
        "filed_before": search_params.before_date,
        "court": search_params.jurisdiction,
    }
    return {key: value for key, value in candidates.items() if value is not None}

@staticmethod
def get_additional_cluster_metadata(cluster_id):
    """
    Look a cluster up through the search endpoint and return the JSON response.

    Additional metadata about a cluster, such as court and docket number, is
    available from the search endpoint rather than the clusters endpoint.
    """
    search_url = f"{settings.COURTLISTENER_BASE_URL}/api/rest/v3/search"
    query = {"q": f"cluster_id:{cluster_id}"}
    auth_headers = {"Authorization": f"Token {settings.COURTLISTENER_API_KEY}"}

    response = requests.get(search_url, params=query, headers=auth_headers)
    response.raise_for_status()
    return response.json()


class LegacyNoSearch:
Expand Down
6 changes: 5 additions & 1 deletion web/main/templates/export/as_printable_html/node.html
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,11 @@ <h2 class="subtitle">{{ node.subtitle }}</h2>

{% if node.resource_type.lower == 'legaldocument' %}
{% if node.resource.doc_class.lower == 'case' %}
{% include "includes/legal_doc_sources/cap_header.html" with legal_doc=node.resource %}
{% if node.resource.metadata.html_info.source == 'cap' %}
{% include "includes/legal_doc_sources/cap_header.html" with legal_doc=node.resource %}
{% elif node.resource.metadata.html_info.source == 'court listener' %}
{% include "includes/legal_doc_sources/court_listener_header.html" with legal_doc=node.resource %}
{% endif %}
{% elif node.resource.doc_class.lower == 'code' %}
{% include "includes/legal_doc_sources/gpo_header.html" with legal_doc=node.resource %}
{% endif %}
Expand Down
5 changes: 2 additions & 3 deletions web/main/templates/includes/legal_doc_sources/cap_header.html
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
<header class="case-header legal-doc-header" >
{% with md=legal_doc.metadata %}
<div class="court" data-custom-style="Case Header"> {{ md.court.name }}</div>
<div class="title" data-custom-style="Case Header">{{ legal_doc.get_title }}</div>
<div class="citation" data-custom-style="Case Header"> {{legal_doc.cite_string}} </div>
<div class="court" data-custom-style="Case Header">{{ md.court.name }}</div>
<div class="citation" data-custom-style="Case Header">{{ legal_doc.cite_string }}</div>
{% if md.docket_number %}<div class="docketnumber" data-custom-style="Case Header">{{ md.docket_number }}</div>{% endif %}
{% if md.decision_date %}<div class="decisiondate" data-custom-style="Case Header">{{ md.decision_date }}</div>{% endif %}
{% endwith %}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<header class="case-header legal-doc-header" >
{% with md=legal_doc.metadata %}
<div class="court" data-custom-style="Case Header">{{ md.court.name }}</div>
<div class="citation" data-custom-style="Case Header">{{ legal_doc.cite_string }}</div>
{% if md.docket_number %}<div class="docketnumber" data-custom-style="Case Header">{{ md.docket_number }}</div>{% endif %}
{% if md.date_filed %}<div class="decisiondate" data-custom-style="Case Header">{{ md.date_filed }}</div>{% endif %}
{% endwith %}
</header>

0 comments on commit 3d7b319

Please sign in to comment.