Skip to content

Commit

Permalink
[WIP]fix(mtd): fixing several anomalies for mtd sync with inpn mtd
Browse files Browse the repository at this point in the history
  • Loading branch information
VincentCauchois committed Oct 16, 2024
1 parent 3b2d232 commit 70759dc
Show file tree
Hide file tree
Showing 4 changed files with 213 additions and 32 deletions.
34 changes: 27 additions & 7 deletions backend/geonature/core/gn_meta/mtd/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
class MTDInstanceApi:
af_path = "/mtd/cadre/export/xml/GetRecordsByInstanceId?id={ID_INSTANCE}"
ds_path = "/mtd/cadre/jdd/export/xml/GetRecordsByInstanceId?id={ID_INSTANCE}"
# TODO: check if there are endpoints to retrieve metadata for a given user and a given instance, rather than only for a given user across all instances
ds_user_path = "/mtd/cadre/jdd/export/xml/GetRecordsByUserId?id={ID_ROLE}"
af_user_path = "/mtd/cadre/export/xml/GetRecordsByUserId?id={ID_ROLE}"
single_af_path = "/mtd/cadre/export/xml/GetRecordById?id={ID_AF}" # NOTE: `ID_AF` is actually an UUID and not an ID from the point of view of geonature database.
Expand All @@ -60,7 +61,10 @@ def _get_xml(self, path):
def _get_af_xml(self):
return self._get_xml(self.af_path)

def get_af_list(self):
# TODO: make the functions `get_af_list` and `get_ds_list` homogeneous
# - Use functions `parse_acquisition_frameworks_xml`, and `parse_datasets_xml`, OR `parse_acquisition_framework`, and `parse_dataset`

def get_af_list(self) -> list:
xml = self._get_af_xml()
_xml_parser = etree.XMLParser(ns_clean=True, recover=True, encoding="utf-8")
root = etree.fromstring(xml, parser=_xml_parser)
Expand All @@ -73,7 +77,7 @@ def get_af_list(self):
def _get_ds_xml(self):
return self._get_xml(self.ds_path)

def get_ds_list(self):
def get_ds_list(self) -> list:
xml = self._get_ds_xml()
return parse_jdd_xml(xml)

Expand Down Expand Up @@ -206,11 +210,19 @@ def process_af_and_ds(af_list, ds_list, id_role=None):
add_unexisting_digitizer(af["id_digitizer"] if not id_role else id_role)
user_add_total_time += time.time() - start_add_user_time
af = sync_af(af)
# TODO: choose whether or not to commit retrieval of the AF before association of actors
# and possibly retrieve an AF without any actor associated to it
db.session.commit()
# If AF has not been synchronized ; due to the lack of a UUID ; actor cannot be associated to it
# and thus we skip to the next AF
if not af:
continue
associate_actors(
actors,
CorAcquisitionFrameworkActor,
"id_acquisition_framework",
af.id_acquisition_framework,
af.unique_acquisition_framework_id,
)
# TODO: remove actors removed from MTD
db.session.commit()
Expand All @@ -227,7 +239,13 @@ def process_af_and_ds(af_list, ds_list, id_role=None):
user_add_total_time += time.time() - start_add_user_time
ds = sync_ds(ds, list_cd_nomenclature)
if ds is not None:
associate_actors(actors, CorDatasetActor, "id_dataset", ds.id_dataset)
associate_actors(
actors,
CorDatasetActor,
"id_dataset",
ds.id_dataset,
ds.unique_dataset_id,
)

user_add_total_time = round(user_add_total_time, 2)
db.session.commit()
Expand All @@ -253,11 +271,13 @@ def sync_af_and_ds_by_user(id_role, id_af=None):
"""
Method to trigger MTD sync on user authentication.
Args:
id_role (int): The ID of the role (group or user).
id_af (str, optional): The ID of the AF (Acquisition Framework). Defaults to None.
Parameters
-----------
id_role : int
The ID of the role (group or user).
id_af : str, optional
The ID of an AF (Acquisition Framework).
"""

logger.info("MTD - SYNC USER : START")

# Create an instance of MTDInstanceApi
Expand Down
177 changes: 153 additions & 24 deletions backend/geonature/core/gn_meta/mtd/mtd_utils.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
import logging
import json
from copy import copy
import pprint
from typing import Literal, Union
from flask import current_app

from sqlalchemy import select, exists
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.exc import SQLAlchemyError, IntegrityError
from sqlalchemy.sql import func, update

from sqlalchemy.dialects.postgresql import insert as pg_insert

from geonature.utils.env import DB
from geonature.utils.env import DB, db
from geonature.core.gn_meta.models import (
TDatasets,
CorDatasetActor,
Expand All @@ -32,7 +34,8 @@
}

# get the root logger
log = logging.getLogger()
# log = logging.getLogger()
logger = logging.getLogger("MTD_SYNC")


def sync_ds(ds, cd_nomenclatures):
Expand All @@ -43,12 +46,22 @@ def sync_ds(ds, cd_nomenclatures):
:param ds: <dict> DS infos
:param cd_nomenclatures: <array> cd_nomenclature from ref_nomenclatures.t_nomenclatures
"""

uuid_ds = ds["unique_dataset_id"]
name_ds = ds["dataset_name"]

logger.debug("MTD - PROCESSING DS WITH UUID '%s' AND NAME '%s'" % (uuid_ds, name_ds))

if not ds["cd_nomenclature_data_origin"]:
ds["cd_nomenclature_data_origin"] = "NSP"

# FIXME: the following temporary fix was added due to possible differences in referential of nomenclatures values between INPN and GeoNature
# should be fixed by ensuring that the two referentials are identical, at least for instances that integrate with INPN and thus rely on MTD synchronization from INPN Métadonnées: GINCO and DEPOBIO instances.
if ds["cd_nomenclature_data_origin"] not in cd_nomenclatures:
ds_cd_nomenclature_data_origin = ds["cd_nomenclature_data_origin"]
if ds_cd_nomenclature_data_origin not in cd_nomenclatures:
logger.warning(
f"MTD - Nomenclature with code '{ds_cd_nomenclature_data_origin}' not found in database - SKIPPING SYNCHRONIZATION OF DATASET WITH UUID '{uuid_ds}' AND NAME '{name_ds}'"
)
return

# CONTROL AF
Expand All @@ -62,7 +75,9 @@ def sync_ds(ds, cd_nomenclatures):
)

if af is None:
log.warning(f"AF with UUID '{af_uuid}' not found in database.")
logger.warning(
f"MTD - AF with UUID '{af_uuid}' not found in database - SKIPPING SYNCHRONIZATION OF DATASET WITH UUID '{uuid_ds}' AND NAME '{name_ds}'"
)
return

ds["id_acquisition_framework"] = af.id_acquisition_framework
Expand Down Expand Up @@ -121,7 +136,19 @@ def sync_af(af):
TAcquisitionFramework
The updated or inserted acquisition framework.
"""
# TODO: handle case where af_uuid is None ; as will raise an error at database level when executing the statement below ;
# af_uuid being None, i.e. af UUID is missing, could be due to no UUID specified in `<ca:identifiantCadre/>` tag in the XML file
# Solutions - if UUID is missing:
# - Just pass the sync of the AF
# - Generate a UUID for the AF
af_uuid = af["unique_acquisition_framework_id"]

if not af_uuid:
logger.warning(
f"No UUID provided for the AF with UUID '{af_uuid}' - SKIPPING SYNCHRONIZATION FOR THIS AF."
)
return None

af_exists = DB.session.scalar(
exists().where(TAcquisitionFramework.unique_acquisition_framework_id == af_uuid).select()
)
Expand Down Expand Up @@ -184,24 +211,39 @@ def add_or_update_organism(uuid, nom, email):
return DB.session.execute(statement).scalar()


def associate_actors(actors, CorActor, pk_name, pk_value):
def associate_actors(
actors,
CorActor: Union[CorAcquisitionFrameworkActor, CorDatasetActor],
pk_name: Literal["id_acquisition_framework", "id_dataset"],
pk_value: str,
uuid_mtd: str,
):
"""
Associate actor and DS or AF according to CorActor value.
Associate actors with either a given :
- Acquisition framework - writing to the table `gn_meta.cor_acquisition_framework_actor`.
- Dataset - writing to the table `gn_meta.cor_dataset_actor`.
Parameters
----------
actors : list
list of actors
CorActor : db.Model
table model
pk_name : str
pk attribute name
CorActor : CorAcquisitionFrameworkActor | CorDatasetActor
the SQLAlchemy model corresponding to the destination table
pk_name : Literal['id_acquisition_framework', 'id_dataset']
pk attribute name:
- 'id_acquisition_framework' for AF
- 'id_dataset' for DS
pk_value : str
pk value
pk value: ID of the AF or DS
uuid_mtd : str
UUID of the AF or DS
"""
type_mtd = "AF" if pk_name == "id_acquisition_framework" else "DS"
for actor in actors:
id_organism = None
uuid_organism = actor["uuid_organism"]
# TODO: choose whether to add or update an organism with no UUID specified
# - add or update it using organism name only - field `organism`
if uuid_organism:
with DB.session.begin_nested():
# create or update organisme
Expand All @@ -211,26 +253,65 @@ def associate_actors(actors, CorActor, pk_name, pk_value):
nom=actor["organism"] if actor["organism"] else "",
email=actor["email"],
)
# else:
# # Create a new organism in database from organism name
# # /!\ Do not use actor email as organism email - create the organism with a name only and generating a new UUID
# raise NotImplementedError(
# f"Creation of new organism, if no UUID provided for the organism actor, not implemented yet."
# )
values = dict(
id_nomenclature_actor_role=func.ref_nomenclatures.get_id_nomenclature(
"ROLE_ACTEUR", actor["actor_role"]
),
**{pk_name: pk_value},
)
if not id_organism:
values["id_role"] = DB.session.scalar(
select(User.id_role).filter_by(email=actor["email"])
)
else:
# TODO: choose whether to:
# - (retained) Try to associate to an organism first and then to a user
# - Try to associate to a user first and then to an organism
if id_organism:
values["id_organism"] = id_organism
statement = (
pg_insert(CorActor)
.values(**values)
.on_conflict_do_nothing(
index_elements=[pk_name, "id_organism", "id_nomenclature_actor_role"],
# TODO: handle case where no user is retrieved for the actor email:
# - (retained) Just do not try to associate the actor with the metadata
# - Try to retrieve an id_organism from the organism name - field `organism`
# - Try to retrieve an id_organism from the actor email considered as an organism email - field `email`
# - Try to insert a new user from the actor name - field `name` - and possibly also email - field `email`
else:
id_user_from_email = DB.session.scalar(
select(User.id_role).filter_by(email=actor["email"]).where(User.groupe.is_(False))
)
if id_user_from_email:
values["id_role"] = id_user_from_email
else:
# TODO: if actor role is "Contact Principal" ; id_nomenclature_actor_role = ? ; then a new user is created with a UUID and an ID only, but with no name nor email
# the metadata is then associated with this new user
raise NotImplementedError(
f"If actor role is 'Contact Principal': creation of a new user, if no organism retrieved nor known user from email retrieved, not implemented yet."
)
logger.warning(
f"MTD - actor association impossible for {type_mtd} with UUID '{uuid_mtd}' because no id_organism nor id_role could be retrieved - with the following actor information:\n"
+ format_str_dict_actor_for_logging(actor)
)
continue
try:
statement = (
pg_insert(CorActor)
.values(**values)
.on_conflict_do_nothing(
index_elements=[
pk_name,
"id_organism" if id_organism else "id_role",
"id_nomenclature_actor_role",
],
)
)
DB.session.execute(statement)
except IntegrityError as I:
db.session.rollback()
logger.error(
f"MTD - DB INTEGRITY ERROR - actor association failed for {type_mtd} with UUID '{uuid_mtd}' and following actor information:\n"
+ format_sqlalchemy_error_for_logging(I)
+ format_str_dict_actor_for_logging(actor)
)
)
DB.session.execute(statement)


def associate_dataset_modules(dataset):
Expand All @@ -246,3 +327,51 @@ def associate_dataset_modules(dataset):
)
).all()
)


def format_sqlalchemy_error_for_logging(error: SQLAlchemyError):
    """
    Format SQLAlchemy error information in a nice way for MTD logging

    The result is a newline-terminated block of three tab-indented parts:
    the original error message, the SQL statement and the SQL parameters.

    Parameters
    ----------
    error : SQLAlchemyError
        the SQLAlchemy error ; its attributes `orig`, `statement` and `params`
        are used when present, otherwise the error itself / None are reported

    Returns
    -------
    str
        formatted error information
    """
    # `orig`, `statement` and `params` are read defensively: this helper is called
    # from an `except` block and must not raise AttributeError itself while logging
    original_error = getattr(error, "orig", None)
    if original_error is None:
        original_error = error
    statement = getattr(error, "statement", None)
    params = getattr(error, "params", None)

    # The original message may or may not end with a newline: trailing newlines are
    # dropped and the separator is written explicitly, so that "SQL QUERY" always
    # starts on its own indented line
    indented_original_error_message = (
        str(original_error).rstrip("\n").replace("\n", "\n\t")
    )

    formatted_error_message = "".join(
        [
            f"\t{indented_original_error_message}\n",
            f"\tSQL QUERY: {statement}\n",
            f"\tSQL PARAMS: {params}\n",
        ]
    )

    return formatted_error_message


def format_str_dict_actor_for_logging(actor: dict):
    """
    Build the indented text block describing an actor in the MTD logs

    Parameters
    ----------
    actor : dict
        actor information: actor_role, email, name, organism, uuid_organism, ...

    Returns
    -------
    str
        an "ACTOR:" header indented with one tab, followed by the pretty-printed
        actor with each of its lines indented with two tabs
    """
    header = "\tACTOR:\n\t\t"
    continuation_indent = "\n\t\t"

    # Re-joining the pretty-printed lines with the two-tab indent gives every
    # line after the first the same indentation as the first one
    pretty_actor_lines = pprint.pformat(actor).split("\n")
    indented_actor = continuation_indent.join(pretty_actor_lines).rstrip("\t")

    return header + indented_actor
32 changes: 31 additions & 1 deletion backend/geonature/core/gn_meta/mtd/xml_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,13 @@ def parse_actors_xml(actors):
return actor_list


# TODO: IMPORTANT - filter the list of acquisition frameworks with `ID_INSTANCE_FILTER` as made for the list of datasets

# TODO: make functions for AF and DS homogeneous: refactor
# - Have functions `parse_acquisition_frameworks_xml`, and `parse_datasets_xml`, OR `parse_acquisition_framework`, and `parse_dataset`
# - Eventually split into distinct functions the XML parsing and the mapping of fields


def parse_acquisition_framwork_xml(xml):
"""
Parse an xml of AF from a string
Expand Down Expand Up @@ -130,10 +137,33 @@ def parse_jdd_xml(xml):

root = ET.fromstring(xml, parser=_xml_parser)
jdd_list = []

def format_acquisition_framework_id_from_xml(provided_af_uuid) -> str | None:
    """
    Format the acquisition framework UUID provided for the dataset
    i.e. the value for the tag `<jdd:identifiantCadre>` in the XML file

    Values of the form `http://oafs.fr/meta/ca/<uuid>` are reduced to their last
    path segment ; any other non-empty value is returned unchanged.

    Args:
        provided_af_uuid (str): The acquisition framework UUID

    Returns:
        str | None: The formatted acquisition framework UUID, or None if none was provided
    """
    if not provided_af_uuid:
        return None

    url_prefix = "http://oafs.fr/meta/ca/"
    if provided_af_uuid.startswith(url_prefix):
        # Trailing slashes are ignored so that the last path segment is the identifier
        # itself and not an empty string ; a bare prefix carries no identifier at all
        # and is reported as None rather than ""
        remainder = provided_af_uuid[len(url_prefix) :].rstrip("/")
        return remainder.split("/")[-1] or None

    return provided_af_uuid

for jdd in root.findall(".//" + namespace + "JeuDeDonnees"):
# We extract all the required information from the different tags of the XML file
jdd_uuid = get_tag_content(jdd, "identifiantJdd")
ca_uuid = get_tag_content(jdd, "identifiantCadre")
# TODO: handle case where value for the tag `<jdd:identifiantCadre>` in the XML file is not of the form `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`
# Solutions - if in the form `http://oafs.fr/meta/ca/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` (has some entries for INPN MTD PREPROD and instance 'Nationale') :
# - (retained) Format by keeping only the `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` part
# - Add a check further in the MTD sync to process only if ca_uuid is in the right format
ca_uuid = format_acquisition_framework_id_from_xml(get_tag_content(jdd, "identifiantCadre"))
dataset_name = get_tag_content(jdd, "libelle")
dataset_shortname = get_tag_content(jdd, "libelleCourt", default_value="")
dataset_desc = get_tag_content(jdd, "description", default_value="")
Expand Down
Loading

0 comments on commit 70759dc

Please sign in to comment.