[WIP]fix(mtd): fixing several anomalies for mtd sync with inpn mtd

PnX-SI · Oct 16, 2024 · 70759dc · 70759dc
1 parent 3b2d232
commit 70759dc
Show file tree

Hide file tree

Showing 4 changed files with 213 additions and 32 deletions.
diff --git a/backend/geonature/core/gn_meta/mtd/__init__.py b/backend/geonature/core/gn_meta/mtd/__init__.py
@@ -36,6 +36,7 @@
 class MTDInstanceApi:
     af_path = "/mtd/cadre/export/xml/GetRecordsByInstanceId?id={ID_INSTANCE}"
     ds_path = "/mtd/cadre/jdd/export/xml/GetRecordsByInstanceId?id={ID_INSTANCE}"
+    # TODO: check if there are endpoints to retrieve metadata for a given user and instance, and not only a given user and whatever instance
     ds_user_path = "/mtd/cadre/jdd/export/xml/GetRecordsByUserId?id={ID_ROLE}"
     af_user_path = "/mtd/cadre/export/xml/GetRecordsByUserId?id={ID_ROLE}"
     single_af_path = "/mtd/cadre/export/xml/GetRecordById?id={ID_AF}"  # NOTE: `ID_AF` is actually an UUID and not an ID from the point of view of geonature database.
@@ -60,7 +61,10 @@ def _get_xml(self, path):
     def _get_af_xml(self):
         return self._get_xml(self.af_path)
 
-    def get_af_list(self):
+    # TODO: make the functions `get_af_list` and `get_ds_list` homogeneous
+    #   - Use functions `parse_acquisition_framworks_xml`, and `parse_datasets_xml`, OR `parse_acquisition_framework`, and `parse_dataset`
+
+    def get_af_list(self) -> list:
         xml = self._get_af_xml()
         _xml_parser = etree.XMLParser(ns_clean=True, recover=True, encoding="utf-8")
         root = etree.fromstring(xml, parser=_xml_parser)
@@ -73,7 +77,7 @@ def get_af_list(self):
     def _get_ds_xml(self):
         return self._get_xml(self.ds_path)
 
-    def get_ds_list(self):
+    def get_ds_list(self) -> list:
         xml = self._get_ds_xml()
         return parse_jdd_xml(xml)
 
@@ -206,11 +210,19 @@ def process_af_and_ds(af_list, ds_list, id_role=None):
             add_unexisting_digitizer(af["id_digitizer"] if not id_role else id_role)
             user_add_total_time += time.time() - start_add_user_time
         af = sync_af(af)
+        # TODO: choose whether or not to commit retrieval of the AF before association of actors
+        #   and possibly retrieve an AF without any actor associated to it
+        db.session.commit()
+        # If AF has not been synchronized ; due to the lack of a UUID ; actor cannot be associated to it
+        #   and thus we skip to the next AF
+        if not af:
+            continue
         associate_actors(
             actors,
             CorAcquisitionFrameworkActor,
             "id_acquisition_framework",
             af.id_acquisition_framework,
+            af.unique_acquisition_framework_id,
         )
         # TODO: remove actors removed from MTD
     db.session.commit()
@@ -227,7 +239,13 @@ def process_af_and_ds(af_list, ds_list, id_role=None):
             user_add_total_time += time.time() - start_add_user_time
         ds = sync_ds(ds, list_cd_nomenclature)
         if ds is not None:
-            associate_actors(actors, CorDatasetActor, "id_dataset", ds.id_dataset)
+            associate_actors(
+                actors,
+                CorDatasetActor,
+                "id_dataset",
+                ds.id_dataset,
+                ds.unique_dataset_id,
+            )
 
     user_add_total_time = round(user_add_total_time, 2)
     db.session.commit()
@@ -253,11 +271,13 @@ def sync_af_and_ds_by_user(id_role, id_af=None):
     """
     Method to trigger MTD sync on user authentication.
 
-    Args:
-        id_role (int): The ID of the role (group or user).
-        id_af (str, optional): The ID of the AF (Acquisition Framework). Defaults to None.
+    Parameters
+    -----------
+    id_role : int
+        The ID of the role (group or user).
+    id_af : str, optional
+        The ID of an AF (Acquisition Framework).
     """
-
     logger.info("MTD - SYNC USER : START")
 
     # Create an instance of MTDInstanceApi

diff --git a/backend/geonature/core/gn_meta/mtd/mtd_utils.py b/backend/geonature/core/gn_meta/mtd/mtd_utils.py
@@ -1,15 +1,17 @@
 import logging
 import json
 from copy import copy
+import pprint
+from typing import Literal, Union
 from flask import current_app
 
 from sqlalchemy import select, exists
-from sqlalchemy.exc import SQLAlchemyError
+from sqlalchemy.exc import SQLAlchemyError, IntegrityError
 from sqlalchemy.sql import func, update
 
 from sqlalchemy.dialects.postgresql import insert as pg_insert
 
-from geonature.utils.env import DB
+from geonature.utils.env import DB, db
 from geonature.core.gn_meta.models import (
     TDatasets,
     CorDatasetActor,
@@ -32,7 +34,8 @@
 }
 
 # get the root logger
-log = logging.getLogger()
+# log = logging.getLogger()
+logger = logging.getLogger("MTD_SYNC")
 
 
 def sync_ds(ds, cd_nomenclatures):
@@ -43,12 +46,22 @@ def sync_ds(ds, cd_nomenclatures):
     :param ds: <dict> DS infos
     :param cd_nomenclatures: <array> cd_nomenclature from ref_normenclatures.t_nomenclatures
     """
+
+    uuid_ds = ds["unique_dataset_id"]
+    name_ds = ds["dataset_name"]
+
+    logger.debug("MTD - PROCESSING DS WITH UUID '%s' AND NAME '%s'" % (uuid_ds, name_ds))
+
     if not ds["cd_nomenclature_data_origin"]:
         ds["cd_nomenclature_data_origin"] = "NSP"
 
     # FIXME: the following temporary fix was added due to possible differences in referential of nomenclatures values between INPN and GeoNature
     #     should be fixed by ensuring that the two referentials are identical, at least for instances that integrates with INPN and thus rely on MTD synchronization from INPN Métadonnées: GINCO and DEPOBIO instances.
-    if ds["cd_nomenclature_data_origin"] not in cd_nomenclatures:
+    ds_cd_nomenclature_data_origin = ds["cd_nomenclature_data_origin"]
+    if ds_cd_nomenclature_data_origin not in cd_nomenclatures:
+        logger.warning(
+            f"MTD - Nomenclature with code '{ds_cd_nomenclature_data_origin}' not found in database - SKIPPING SYNCHRONIZATION OF DATASET WITH UUID '{uuid_ds}' AND NAME '{name_ds}'"
+        )
         return
 
     # CONTROL AF
@@ -62,7 +75,9 @@ def sync_ds(ds, cd_nomenclatures):
     )
 
     if af is None:
-        log.warning(f"AF with UUID '{af_uuid}' not found in database.")
+        logger.warning(
+            f"MTD - AF with UUID '{af_uuid}' not found in database - SKIPPING SYNCHRONIZATION OF DATASET WITH UUID '{uuid_ds}' AND NAME '{name_ds}'"
+        )
         return
 
     ds["id_acquisition_framework"] = af.id_acquisition_framework
@@ -121,7 +136,19 @@ def sync_af(af):
     TAcquisitionFramework
         The updated or inserted acquisition framework.
     """
+    # TODO: handle case where af_uuid is None ; as will raise an error at database level when executing the statement below ;
+    #   af_uuid being None, i.e. af UUID is missing, could be due to no UUID specified in `<ca:identifiantCadre/>` tag in the XML file
+    #   Solutions - if UUID is missing:
+    #       - Just pass the sync of the AF
+    #       - Generate a UUID for the AF
     af_uuid = af["unique_acquisition_framework_id"]
+
+    if not af_uuid:
+        logger.warning(
+            f"No UUID provided for the AF with UUID '{af_uuid}' - SKIPPING SYNCHRONIZATION FOR THIS AF."
+        )
+        return None
+
     af_exists = DB.session.scalar(
         exists().where(TAcquisitionFramework.unique_acquisition_framework_id == af_uuid).select()
     )
@@ -184,24 +211,39 @@ def add_or_update_organism(uuid, nom, email):
     return DB.session.execute(statement).scalar()
 
 
-def associate_actors(actors, CorActor, pk_name, pk_value):
+def associate_actors(
+    actors,
+    CorActor: Union[CorAcquisitionFrameworkActor, CorDatasetActor],
+    pk_name: Literal["id_acquisition_framework", "id_dataset"],
+    pk_value: str,
+    uuid_mtd: str,
+):
     """
-    Associate actor and DS or AF according to CorActor value.
+    Associate actors with either a given :
+    - Acquisition framework - writing to the table `gn_meta.cor_acquisition_framework_actor`.
+    - Dataset - writing to the table `gn_meta.cor_dataset_actor`.
 
     Parameters
     ----------
     actors : list
         list of actors
-    CorActor : db.Model
-        table model
-    pk_name : str
-        pk attribute name
+    CorActor : CorAcquisitionFrameworkActor | CorDatasetActor
+        the SQLAlchemy model corresponding to the destination table
+    pk_name : Literal['id_acquisition_framework', 'id_dataset']
+        pk attribute name:
+        - 'id_acquisition_framework' for AF
+        - 'id_dataset' for DS
     pk_value : str
-        pk value
+        pk value: ID of the AF or DS
+    uuid_mtd : str
+        UUID of the AF or DS
     """
+    type_mtd = "AF" if pk_name == "id_acquisition_framework" else "DS"
     for actor in actors:
         id_organism = None
         uuid_organism = actor["uuid_organism"]
+        # TODO: choose whether to add or update an organism with no UUID specified
+        #   - add or update it using organism name only - field `organism`
         if uuid_organism:
             with DB.session.begin_nested():
                 # create or update organisme
@@ -211,26 +253,65 @@ def associate_actors(actors, CorActor, pk_name, pk_value):
                     nom=actor["organism"] if actor["organism"] else "",
                     email=actor["email"],
                 )
+        # else:
+        #     # Create a new organism in database from organism name
+        #     #   /!\ Do not use actor email as organism email - create the organism with a name only and generating a new UUID
+        #     raise NotImplementedError(
+        #         f"Creation of new organism, if no UUID provided for the organism actor, not implemented yet."
+        #     )
         values = dict(
             id_nomenclature_actor_role=func.ref_nomenclatures.get_id_nomenclature(
                 "ROLE_ACTEUR", actor["actor_role"]
             ),
             **{pk_name: pk_value},
         )
-        if not id_organism:
-            values["id_role"] = DB.session.scalar(
-                select(User.id_role).filter_by(email=actor["email"])
-            )
-        else:
+        # TODO: choose wether to:
+        #   - (retained) Try to associate to an organism first and then to a user
+        #   - Try to associate to a user first and then to an organism
+        if id_organism:
             values["id_organism"] = id_organism
-        statement = (
-            pg_insert(CorActor)
-            .values(**values)
-            .on_conflict_do_nothing(
-                index_elements=[pk_name, "id_organism", "id_nomenclature_actor_role"],
+        # TODO: handle case where no user is retrieved for the actor email:
+        #   - (retained) Just do not try to associate the actor with the metadata
+        #   - Try to retrieve and id_organism from the organism name - field `organism`
+        #   - Try to retrieve and id_organism from the actor email considered as an organism email - field `email`
+        #   - Try to insert a new user from the actor name - field `name` - and possibly also email - field `email`
+        else:
+            id_user_from_email = DB.session.scalar(
+                select(User.id_role).filter_by(email=actor["email"]).where(User.groupe.is_(False))
+            )
+            if id_user_from_email:
+                values["id_role"] = id_user_from_email
+            else:
+                # TODO: if actor role is "Contact Principal" ; id_nomenclature_actor_role = ? ; then a new user is created with a UUID and an ID only, but with no name nor email
+                #   the metadata is then associated with this new user
+                raise NotImplementedError(
+                    f"If actor role is 'Contact Principal': creation of a new user, if no organism retrieved nor known user from email retrieved, not implemented yet."
+                )
+                logger.warning(
+                    f"MTD - actor association impossible for {type_mtd} with UUID '{uuid_mtd}' because no id_organism nor id_role could be retrieved - with the following actor information:\n"
+                    + format_str_dict_actor_for_logging(actor)
+                )
+                continue
+        try:
+            statement = (
+                pg_insert(CorActor)
+                .values(**values)
+                .on_conflict_do_nothing(
+                    index_elements=[
+                        pk_name,
+                        "id_organism" if id_organism else "id_role",
+                        "id_nomenclature_actor_role",
+                    ],
+                )
+            )
+            DB.session.execute(statement)
+        except IntegrityError as I:
+            db.session.rollback()
+            logger.error(
+                f"MTD - DB INTEGRITY ERROR - actor association failed for {type_mtd} with UUID '{uuid_mtd}' and following actor information:\n"
+                + format_sqlalchemy_error_for_logging(I)
+                + format_str_dict_actor_for_logging(actor)
             )
-        )
-        DB.session.execute(statement)
 
 
 def associate_dataset_modules(dataset):
@@ -246,3 +327,51 @@ def associate_dataset_modules(dataset):
             )
         ).all()
     )
+
+
+def format_sqlalchemy_error_for_logging(error: SQLAlchemyError):
+    """
+    Format SQLAlchemy error information in a nice way for MTD logging
+
+    Parameters
+    ----------
+    error : SQLAlchemyError
+        the SQLAlchemy error
+
+    Returns
+    -------
+    str
+        formatted error information
+    """
+    indented_original_error_message = str(error.orig).replace("\n", "\n\t")
+
+    formatted_error_message = "".join(
+        [
+            f"\t{indented_original_error_message}",
+            f"SQL QUERY:  {error.statement}\n",
+            f"\tSQL PARAMS:  {error.params}\n",
+        ]
+    )
+
+    return formatted_error_message
+
+
+def format_str_dict_actor_for_logging(actor: dict):
+    """
+    Format actor information in a nice way for MTD logging
+
+    Parameters
+    ----------
+    actor : dict
+        actor information: actor_role, email, name, organism, uuid_organism, ...
+
+    Returns
+    -------
+    str
+        formatted actor information
+    """
+    formatted_str_dict_actor = "\tACTOR:\n\t\t" + pprint.pformat(actor).replace(
+        "\n", "\n\t\t"
+    ).rstrip("\t")
+
+    return formatted_str_dict_actor
diff --git a/backend/geonature/core/gn_meta/mtd/xml_parser.py b/backend/geonature/core/gn_meta/mtd/xml_parser.py
@@ -61,6 +61,13 @@ def parse_actors_xml(actors):
     return actor_list
 
 
+# TODO: IMPORTANT - filter the list of acquisition frameworks with `ID_INSTANCE_FILTER` as made for the list of datasets
+
+# TODO: make functions for AF and DS homogeneous: refactorize
+#   - Have functions `parse_acquisition_frameworks_xml`, and `parse_datasets_xml`, OR `parse_acquisition_framework`, and `parse_dataset`
+#   - Eventually split into distinct functions the XML parsing and the mapping of fields
+
+
 def parse_acquisition_framwork_xml(xml):
     """
     Parse an xml of AF from a string
@@ -130,10 +137,33 @@ def parse_jdd_xml(xml):
 
     root = ET.fromstring(xml, parser=_xml_parser)
     jdd_list = []
+
+    def format_acquisition_framework_id_from_xml(provided_af_uuid) -> str | None:
+        """
+        Format the acquisition framework UUID provided for the dataset
+            i.e. the value for the tag `<jdd:identifiantCadre>` in the XML file
+
+        Args:
+            provided_af_uuid (str): The acquisition framework UUID
+        Returns:
+            str | None: The formatted acquisition framework UUID, or None if none was provided
+        """
+        if not provided_af_uuid:
+            return None
+
+        if provided_af_uuid.startswith("http://oafs.fr/meta/ca/"):
+            return provided_af_uuid.split("/")[-1]
+
+        return provided_af_uuid
+
     for jdd in root.findall(".//" + namespace + "JeuDeDonnees"):
         # We extract all the required informations from the different tags of the XML file
         jdd_uuid = get_tag_content(jdd, "identifiantJdd")
-        ca_uuid = get_tag_content(jdd, "identifiantCadre")
+        # TODO: handle case where value for the tag `<jdd:identifiantCadre>` in the XML file is not of the form `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`
+        #   Solutions - if in the form `http://oafs.fr/meta/ca/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` (has some entries for INPN MTD PREPROD and instance 'Nationale') :
+        #       - (retained) Format by keeping only the `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` part
+        #       - Add a check further in the MTD sync to process only if ca_uuid is in the right format
+        ca_uuid = format_acquisition_framework_id_from_xml(get_tag_content(jdd, "identifiantCadre"))
         dataset_name = get_tag_content(jdd, "libelle")
         dataset_shortname = get_tag_content(jdd, "libelleCourt", default_value="")
         dataset_desc = get_tag_content(jdd, "description", default_value="")