Skip to content

Commit

Permalink
refatora sppo_infracao e sppo_licenciamento_stu para buscar do ftp (#683)
Browse files Browse the repository at this point in the history

* refatora sppo_infracao e sppo_licenciamento_stu para buscar do ftp

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* reverte alterações no tratamento e altera get_raw

* corrige ftp_path

* corrige captura por ftp em get_raw_ftp

* corrige ftp_path e filetype em sppo_licenciamento_captura e sppo_infracao_captura

* corrige bytesIO

* ativa schedule em sppo_licenciamento_captura e sppo_infracao_captura

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
  • Loading branch information
3 people authored May 17, 2024
1 parent 1f85455 commit c870c3d
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 7 deletions.
1 change: 1 addition & 0 deletions pipelines/rj_smtr/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -1510,6 +1510,7 @@ class constants(Enum): # pylint: disable=c0103

# INFRAÇÃO
SPPO_INFRACAO_URL = "https://siurblab.rio.rj.gov.br/SMTR/Multas/multas.txt"

SPPO_INFRACAO_MAPPING_KEYS = {
"permissao": "permissao",
"modal": "modo",
Expand Down
16 changes: 9 additions & 7 deletions pipelines/rj_smtr/veiculo/flows.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
create_date_hour_partition,
create_local_partition_path,
get_current_timestamp,
get_raw,
parse_timestamp_to_string,
save_raw_local,
save_treated_local,
Expand All @@ -48,6 +47,7 @@
)

from pipelines.rj_smtr.veiculo.tasks import (
get_raw_ftp,
pre_treatment_sppo_licenciamento,
pre_treatment_sppo_infracao,
get_veiculo_raw_storage,
Expand Down Expand Up @@ -97,10 +97,11 @@
csv_args=constants.SPPO_LICENCIAMENTO_CSV_ARGS.value,
)

raw_status_url = get_raw(
url=constants.SPPO_LICENCIAMENTO_URL.value,
raw_status_url = get_raw_ftp(
ftp_path="LICENCIAMENTO/CadastrodeVeiculos",
filetype="txt",
csv_args=constants.SPPO_LICENCIAMENTO_CSV_ARGS.value,
timestamp=timestamp,
)

ifelse(get_from_storage.is_equal(True), raw_status_gcs, raw_status_url)
Expand Down Expand Up @@ -140,7 +141,7 @@
image=emd_constants.DOCKER_IMAGE.value,
labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value],
)
# sppo_licenciamento_captura.schedule = every_day_hour_seven
sppo_licenciamento_captura.schedule = every_day_hour_seven

with Flow(
f"SMTR: {constants.VEICULO_DATASET_ID.value} {constants.SPPO_INFRACAO_TABLE_ID.value} - Captura",
Expand Down Expand Up @@ -178,10 +179,11 @@
timestamp=timestamp,
csv_args=constants.SPPO_INFRACAO_CSV_ARGS.value,
)
raw_status_url = get_raw(
url=constants.SPPO_INFRACAO_URL.value,
raw_status_url = get_raw_ftp(
ftp_path="MULTAS/MULTAS",
filetype="txt",
csv_args=constants.SPPO_INFRACAO_CSV_ARGS.value,
timestamp=timestamp,
)
ifelse(get_from_storage.is_equal(True), raw_status_gcs, raw_status_url)

Expand Down Expand Up @@ -218,7 +220,7 @@
image=emd_constants.DOCKER_IMAGE.value,
labels=[emd_constants.RJ_SMTR_AGENT_LABEL.value],
)
# sppo_infracao_captura.schedule = every_day_hour_seven
sppo_infracao_captura.schedule = every_day_hour_seven

# flake8: noqa: E501
with Flow(
Expand Down
49 changes: 49 additions & 0 deletions pipelines/rj_smtr/veiculo/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

from pipelines.rj_smtr.constants import constants
from pipelines.rj_smtr.utils import (
connect_ftp,
data_info_str,
filter_data,
)
Expand Down Expand Up @@ -243,3 +244,51 @@ def pre_treatment_sppo_infracao(status: dict, timestamp: datetime):
error = exp

return {"data": data, "error": error}


@task
def get_raw_ftp(
    ftp_path: str,
    filetype: str,
    csv_args: dict,
    timestamp: datetime,
):
    """
    Retrieve a raw delimited text file from the FTP server and parse it.

    The remote file name is built as
    ``{ftp_path}_{timestamp:%Y%m%d}.{filetype}``.

    Args:
        ftp_path (str): Path prefix of the file on the FTP server (without the
            date suffix and extension).
        filetype (str): File extension of the raw data file. Only "csv" and
            "txt" are supported.
        csv_args (dict): Keyword arguments forwarded to `pd.read_csv`.
        timestamp (datetime): Timestamp whose date (YYYYMMDD) is appended to
            `ftp_path` to build the remote file name.

    Returns:
        dict: A dictionary with two keys:
            'data': the parsed file as a list of dictionaries (one per row),
                or None when the retrieval failed or the extension is
                unsupported.
            'error': None on success, otherwise an error message (the
                formatted traceback when an exception was raised).
    """
    data = None
    error = None
    try:
        if filetype in ("csv", "txt"):
            remote_name = f"{ftp_path}_{timestamp.strftime('%Y%m%d')}.{filetype}"
            # NOTE(review): connect_ftp is a project helper; it is assumed to
            # return an ftplib-style client (retrbinary/quit) - confirm.
            ftp_client = connect_ftp(constants.RDO_FTPS_SECRET_PATH.value)
            buffer = io.BytesIO()
            try:
                ftp_client.retrbinary(f"RETR {remote_name}", buffer.write)
            finally:
                # Always end the FTP session, even when the transfer fails,
                # so a failed run does not leave the connection open.
                ftp_client.quit()

            # Parse only after the connection has been released.
            data = pd.read_csv(
                io.StringIO(buffer.getvalue().decode("utf-8")),
                **csv_args,
            ).to_dict(orient="records")
        else:
            error = "Unsupported raw file extension. Supported only: csv and txt"

    except Exception:
        # Task boundary: report the failure through the returned dict instead
        # of raising, so the caller can inspect 'error'.
        error = traceback.format_exc()
        data = None
        log(f"[CATCHED] Task failed with error: \n{error}", level="error")

    return {"data": data, "error": error}

0 comments on commit c870c3d

Please sign in to comment.