Skip to content

Commit

Permalink
fulltext works
Browse files Browse the repository at this point in the history
  • Loading branch information
cyrillkuettel committed Jul 4, 2024
1 parent c1c41df commit 084813e
Show file tree
Hide file tree
Showing 10 changed files with 296 additions and 181 deletions.
3 changes: 3 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ console_scripts =
add_meeting = privatim.cli.add_meeting:main
delete_meetings = privatim.cli.delete_meetings:main
upgrade = privatim.cli.upgrade:upgrade
print_tsvectors = privatim.cli.helpers:print_tsvectors
print_text = privatim.cli.helpers:print_text
reindex = privatim.cli.helpers:reindex
shell = privatim.cli.shell:shell

[options.extras_require]
Expand Down
81 changes: 81 additions & 0 deletions src/privatim/cli/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import click
from pyramid.paster import bootstrap
from sqlalchemy import select, func

from privatim.models import SearchableAssociatedFiles
from privatim.models.file import SearchableFile
from privatim.orm import Base


@click.command()
@click.argument('config_uri')
def print_tsvectors(config_uri: str) -> None:
    """
    Print the stored tsvector of every searchable model instance.

    Iterates over all mapped models inheriting from
    SearchableAssociatedFiles and echoes, for each row, its id together
    with the value of its ``searchable_text_de_CH`` column.

    :param config_uri: Configuration URI handed to ``bootstrap``.
    """
    env = bootstrap(config_uri)

    with env['request'].tm:
        db = env['request'].dbsession
        # The registry may yield the same class more than once; process
        # every model only a single time.
        seen = set()
        for mapper in Base.registry.mappers:
            cls = mapper.class_
            if not issubclass(cls, SearchableAssociatedFiles) or cls in seen:
                continue
            seen.add(cls)
            click.echo(f"\nProcessing model: {cls.__name__}")
            # Select the id alongside the tsvector: the loop unpacks two
            # values per row, so selecting only the tsvector column would
            # raise a ValueError on the first row.
            stmt = select(cls.id, cls.searchable_text_de_CH)
            for row_id, tsvector in db.execute(stmt).all():
                click.echo(f"ID: {row_id}")
                click.echo(f"TSVector: {tsvector}")
                click.echo("---")


@click.command()
@click.argument('config_uri')
def print_text(config_uri: str) -> None:
    """
    Print the extracted file text of every searchable model instance.

    Iterates over all mapped models inheriting from
    SearchableAssociatedFiles. For each instance, the ``extract`` values of
    its joined files are concatenated (separated by a space) in the
    database and echoed as one block of text.

    :param config_uri: Configuration URI handed to ``bootstrap``.
    """
    env = bootstrap(config_uri)

    with env['request'].tm:
        db = env['request'].dbsession
        # The registry may yield the same class more than once; process
        # every model only a single time.
        seen = set()
        for mapper in Base.registry.mappers:
            cls = mapper.class_
            if not issubclass(cls, SearchableAssociatedFiles) or cls in seen:
                continue
            seen.add(cls)
            click.echo(f"\nProcessing model: {cls.__name__}")
            stmt = (
                select(func.string_agg(SearchableFile.extract, ' '))
                .select_from(cls)
                .join(cls.files)
                .group_by(cls.id)
            )
            # scalars() unwraps the single selected column, so the text
            # itself is printed rather than the repr of a one-element row.
            for content in db.execute(stmt).scalars().all():
                click.echo(f"text_contents: {content}")
                click.echo("---")


@click.command()
@click.argument('config_uri')
def reindex(config_uri: str) -> None:
    """
    Call ``reindex_files()`` on every searchable model instance.

    Loads all instances of each mapped model inheriting from
    SearchableAssociatedFiles and reindexes them one by one, inside the
    ``with env['request'].tm`` block.

    :param config_uri: Configuration URI handed to ``bootstrap``.
    """
    env = bootstrap(config_uri)

    with env['request'].tm:
        session = env['request'].dbsession
        handled = set()
        for mapper in Base.registry.mappers:
            model = mapper.class_
            if not issubclass(model, SearchableAssociatedFiles):
                continue
            if model in handled:
                # Already processed through another mapper entry.
                continue
            handled.add(model)
            click.echo(f"\nProcessing model: {model.__name__}")

            instances = session.execute(select(model)).scalars().all()
            for instance in instances:
                assert isinstance(instance, model)
                click.echo(f"\nReindexing model: {model.__name__} with "
                           f"title: {instance.title[:30]}")
                instance.reindex_files()
23 changes: 14 additions & 9 deletions src/privatim/models/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from sqlalchemy import event

from sqlalchemy.orm import configure_mappers, Mapper
from sqlalchemy.orm import configure_mappers


from privatim.i18n import locales
Expand Down Expand Up @@ -38,8 +38,8 @@
SearchableMixin


from typing import TYPE_CHECKING, Any # noqa: E402
from typing import Any as Incomplete
from typing import TYPE_CHECKING # noqa: E402
from typing import Any as Incomplete # noqa: E402
if TYPE_CHECKING:
from pyramid.config import Configurator
from sqlalchemy.orm import Mapper
Expand Down Expand Up @@ -74,8 +74,8 @@ def includeme(config: 'Configurator') -> None:
)


def update_fulltext_search_text_for_files(
mapper: 'Mapper[Incomplete]', connection: 'Connection', target: Incomplete
def update_fulltext_search_text(
mapper: 'Mapper[Incomplete]', connection: 'Connection', target: Incomplete
) -> None:
"""
Event listener for the 'files' relationship. Triggers a full reindex
Expand All @@ -93,11 +93,16 @@ def update_fulltext_search_text_for_files(
def register_search_listeners(
model: 'type[SearchableAssociatedFiles]',
) -> None:
event.listen(model, 'after_insert',
update_fulltext_search_text_for_files
event.listen(model, 'after_insert', update_fulltext_search_text)
event.listen(
model,
'after_update',
update_fulltext_search_text,
)
event.listen(model, 'after_update',
update_fulltext_search_text_for_files,
event.listen(
model,
'after_delete', # for edit form as well as delete
update_fulltext_search_text,
)


Expand Down
22 changes: 12 additions & 10 deletions src/privatim/models/associated_file.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import logging
from io import BytesIO

from sqlalchemy import func, Index
from sqlalchemy.dialects.postgresql import TSVECTOR
from sqlalchemy.orm import Mapped, deferred, mapped_column, declared_attr
from sqlalchemy_utils import observes
from sqlalchemy_utils.observer import PropertyObserver
from sqlalchemy.orm import Mapped, mapped_column, declared_attr

from privatim.i18n import locales
from privatim.models.file import GeneralFile, SearchableFile
Expand All @@ -19,15 +18,15 @@


class AssociatedFiles:
""" Use this mixin if uploaded files belong to a specific instance """
""" Use this mixin if uploaded files belong to a specific instance """

# one-to-many
files = associated(GeneralFile, 'files')


class SearchableAssociatedFiles:
""" One-to-many files that belong to a specific instance that have their
text contents extracted and stored in a single TSVECTOR column."""
""" Same as AssociatedFiles but provides the toolkit to make a list of
files searchable, if they are pdfs. """

__name__: ClassVar[str]

Expand All @@ -37,10 +36,10 @@ class SearchableAssociatedFiles:

@declared_attr
def searchable_text_de_CH(cls) -> Mapped[TSVECTOR]:
return deferred(mapped_column(
return mapped_column(
TSVECTOR,
nullable=True
))
)

# fixme: tricky to get typing right here.
@declared_attr
Expand All @@ -56,6 +55,8 @@ def __table_args__(cls): # type: ignore
def reindex_files(self) -> None:
"""Extract the text from the files and save it together with
the language.
Note that for now only pdfs are supported.
"""
files_by_locale: dict[str, list[SearchableFile]] = {
locale: [] for locale in locales
Expand All @@ -69,13 +70,14 @@ def reindex_files(self) -> None:
text = ''
for file in files_by_locale[locale]:
try:
pages, extract = extract_pdf_info(file.file)
pages, extract = extract_pdf_info(BytesIO(file.content))
file.extract = (extract or '').strip()
file.word_count = word_count(file.extract)
if file.extract:
text += '\n\n' + file.extract
except Exception as e:
logger.error(f"Error processing file {file.id}: {str(e)}")
logger.error(f"Error extracting text contents for file"
f" {file.id}: {str(e)}")

setattr(
self,
Expand Down
2 changes: 1 addition & 1 deletion src/privatim/models/consultation.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
if TYPE_CHECKING:
from privatim.types import ACL
from sqlalchemy.orm import InstrumentedAttribute
from privatim.models import User, GeneralFile
from privatim.models import User
from privatim.models.file import SearchableFile


Expand Down
8 changes: 6 additions & 2 deletions src/privatim/models/meeting.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class AgendaItemCreationError(Exception):
)


class AgendaItem(Base):
class AgendaItem(Base, SearchableMixin):
""" Traktanden """

__tablename__ = 'agenda_items'
Expand Down Expand Up @@ -115,6 +115,11 @@ def create(
order_by='AgendaItem.position'
)

@classmethod
def searchable_fields(cls) -> Iterator['InstrumentedAttribute[str]']:
yield cls.title
yield cls.description

def __acl__(self) -> list['ACL']:
return [
(Allow, Authenticated, ['view']),
Expand Down Expand Up @@ -177,7 +182,6 @@ def __init__(

@classmethod
def searchable_fields(cls) -> Iterator['InstrumentedAttribute[str]']:
# todo: agenda item (seperately)
yield cls.name

def __acl__(self) -> list['ACL']:
Expand Down
1 change: 0 additions & 1 deletion src/privatim/models/searchable.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from privatim.models.associated_file import SearchableAssociatedFiles
from privatim.orm import Base


Expand Down
1 change: 0 additions & 1 deletion src/privatim/views/activities.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from markupsafe import Markup
from sqlalchemy import select, union_all, desc, cast, String, literal
from privatim.models import Consultation, Meeting
from privatim.i18n import _
Expand Down
Loading

0 comments on commit 084813e

Please sign in to comment.