Deploying to gh-pages from @ 99300bc 🚀
BrunodePauloAlmeida committed Mar 22, 2024
1 parent 585be66 commit b5da3b3
Showing 5 changed files with 135 additions and 14 deletions.
12 changes: 6 additions & 6 deletions rj_sme/dump_url_educacao_basica/flows.html
@@ -57,12 +57,12 @@ Module pipelines.rj_sme.dump_url_educacao_basica.flows
     ],
 )
 
-sme_gsheets_default_parameters = {
-    "dataset_id": "educacao_basica_alocacao",
-}
-sme_gsheets_flow = set_default_parameters(
-    sme_gsheets_flow, default_parameters=sme_gsheets_default_parameters
-)
+# sme_gsheets_default_parameters = {
+#     "dataset_id": "educacao_basica_alocacao",
+# }
+# sme_gsheets_flow = set_default_parameters(
+#     sme_gsheets_flow, default_parameters=sme_gsheets_default_parameters
+# )
 
 sme_gsheets_flow.schedule = gsheets_year_update_schedule</code></pre>
 </details>
85 changes: 83 additions & 2 deletions rj_sme/dump_url_educacao_basica/schedules.html
@@ -54,12 +54,93 @@ Module pipelines.rj_sme.dump_url_educacao_basica.schedules
         "url_type": "google_drive",
         "materialize_after_dump": True,
         "dataset_id": "educacao_basica_alocacao",
-    }
+    },
+    "bimestral_2023": {
+        "dump_mode": "overwrite",
+        "url": "https://drive.google.com/file/d/1bC-I6mT9SdRVDDL583WpeK8WOJMuIhfz/view?usp=drive_link",
+        "url_type": "google_drive",
+        "materialize_after_dump": True,
+        "dataset_id": "educacao_basica_avaliacao",
+    },
+    "bimestral_2022": {
+        "dump_mode": "overwrite",
+        "url": "https://drive.google.com/file/d/19PFXJKvaOrbexnt_jA4otE-LnMfHUH0H/view?usp=drive_link",
+        "url_type": "google_drive",
+        "materialize_after_dump": True,
+        "dataset_id": "educacao_basica_avaliacao",
+        "encoding": "latin-1",
+        "on_bad_lines": "skip",
+        "separator": ";",
+    },
+    "bimestral_2021": {
+        "dump_mode": "overwrite",
+        "url": "https://drive.google.com/file/d/1k-taU8bMEYJ2U5EHvrNWQZnzN2Ht3uso/view?usp=drive_link",
+        "url_type": "google_drive",
+        "materialize_after_dump": True,
+        "dataset_id": "educacao_basica_avaliacao",
+        "encoding": "latin-1",
+    },
+    "bimestral_2019": {
+        "dump_mode": "overwrite",
+        "url": "https://drive.google.com/file/d/1Q_drlgajGOpSsNlqw1cV2pRJ30Oh47MJ/view?usp=drive_link",
+        "url_type": "google_drive",
+        "materialize_after_dump": True,
+        "dataset_id": "educacao_basica_avaliacao",
+    },
+    "bimestral_2018": {
+        "dump_mode": "overwrite",
+        "url": "https://drive.google.com/file/d/1b7wyFsX6T4W6U_VWIjPmJZ4HI9btaLah/view?usp=drive_link",
+        "url_type": "google_drive",
+        "materialize_after_dump": True,
+        "dataset_id": "educacao_basica_avaliacao",
+    },
+    "bimestral_2017": {
+        "dump_mode": "overwrite",
+        "url": "https://drive.google.com/file/d/1kclQeNuzDCy0Npny1ZZLPjqiPMScw_1P/view?usp=drive_link",
+        "url_type": "google_drive",
+        "materialize_after_dump": True,
+        "dataset_id": "educacao_basica_avaliacao",
+    },
+    "bimestral_2016": {
+        "dump_mode": "overwrite",
+        "url": "https://drive.google.com/file/d/1QH9VsphqPvFwUfE7FgQYI6YJ4TJFTptv/view?usp=drive_link",
+        "url_type": "google_drive",
+        "materialize_after_dump": True,
+        "dataset_id": "educacao_basica_avaliacao",
+    },
+    "bimestral_2015": {
+        "dump_mode": "overwrite",
+        "url": "https://drive.google.com/file/d/1VKDnvgOzrEdT5LkNYBDE_ayVvKsj5jR0/view?usp=drive_link",
+        "url_type": "google_drive",
+        "materialize_after_dump": True,
+        "dataset_id": "educacao_basica_avaliacao",
+    },
+    "bimestral_2014": {
+        "dump_mode": "overwrite",
+        "url": "https://drive.google.com/file/d/18pJonyKwV210dpXr_B2M0p708jYYGwKz/view?usp=drive_link",
+        "url_type": "google_drive",
+        "materialize_after_dump": True,
+        "dataset_id": "educacao_basica_avaliacao",
+    },
+    "bimestral_2013": {
+        "dump_mode": "overwrite",
+        "url": "https://drive.google.com/file/d/1rSi-UgB3qZDLh8U3geKRkMgSdmxddO5v/view?usp=drive_link",
+        "url_type": "google_drive",
+        "materialize_after_dump": True,
+        "dataset_id": "educacao_basica_avaliacao",
+    },
+    "bimestral_2012": {
+        "dump_mode": "overwrite",
+        "url": "https://drive.google.com/file/d/1scfnos9iER86QVMx7Y_qPM1SKVv0MUED/view?usp=drive_link",
+        "url_type": "google_drive",
+        "materialize_after_dump": True,
+        "dataset_id": "educacao_basica_avaliacao",
+    },
 }
 
 gsheets_clocks = generate_dump_url_schedules(
     interval=timedelta(days=365),
-    start_date=datetime(2022, 11, 4, 20, 0, tzinfo=pytz.timezone("America/Sao_Paulo")),
+    start_date=datetime(2024, 3, 22, 12, 0, tzinfo=pytz.timezone("America/Sao_Paulo")),
     labels=[
         constants.RJ_SME_AGENT_LABEL.value,
     ],
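A side note on the rescheduled start_date above (pre-existing behaviour, not something this commit changes): passing a pytz timezone directly as tzinfo attaches the zone's historical LMT offset (-03:06 for America/Sao_Paulo) rather than the expected -03:00; pytz's documented pattern is localize(). A minimal sketch of the difference:

    from datetime import datetime
    import pytz

    tz = pytz.timezone("America/Sao_Paulo")

    # tzinfo= attaches the zone's first (LMT) entry: UTC-03:06
    attached = datetime(2024, 3, 22, 12, 0, tzinfo=tz)

    # localize() resolves the correct offset for that instant: UTC-03:00
    localized = tz.localize(datetime(2024, 3, 22, 12, 0))

    print(attached.utcoffset())   # -1 day, 20:54:00
    print(localized.utcoffset())  # -1 day, 21:00:00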
6 changes: 6 additions & 0 deletions utils/dump_url/flows.html
@@ -77,6 +77,9 @@ Module pipelines.utils.dump_url.flows
 
     # Table parameters
     partition_columns = Parameter("partition_columns", required=False, default="")
+    encoding = Parameter("encoding", required=False, default="utf-8")
+    on_bad_lines = Parameter("on_bad_lines", required=False, default="error")
+    separator = Parameter("separator", required=False, default=",")
 
     # Materialization parameters
     materialize_after_dump = Parameter(
@@ -150,6 +153,9 @@ Module pipelines.utils.dump_url.flows
         save_path=DUMP_DATA_PATH,
         build_json_dataframe=build_json_dataframe,
         dataframe_key_column=dataframe_key_column,
+        encoding=encoding,
+        on_bad_lines=on_bad_lines,
+        separator=separator,
     )
     DUMP_CHUNKS_TASK.set_upstream(DOWNLOAD_URL_TASK)
 
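The three new flow Parameters default to pandas' usual behaviour (utf-8, error on bad lines, comma separator), so existing dump_url flows that never set them are unaffected. A minimal, self-contained sketch of the wiring, assuming Prefect 1.x (the flow and task names here are illustrative, not the repository's real ones):

    from prefect import Flow, Parameter, task

    @task
    def dump_chunks(file_path: str, encoding: str, on_bad_lines: str, separator: str):
        # Placeholder body; the real task reads the CSV in chunks and partitions it.
        print(f"reading {file_path} with encoding={encoding}, sep={separator!r}, on_bad_lines={on_bad_lines}")

    with Flow("dump_url_example") as flow:
        file_path = Parameter("file_path", default="/tmp/data.csv")
        encoding = Parameter("encoding", required=False, default="utf-8")
        on_bad_lines = Parameter("on_bad_lines", required=False, default="error")
        separator = Parameter("separator", required=False, default=",")
        dump_chunks(file_path, encoding=encoding, on_bad_lines=on_bad_lines, separator=separator)

    # A run can override any of them, e.g.:
    # flow.run(parameters={"encoding": "latin-1", "separator": ";", "on_bad_lines": "skip"})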
34 changes: 28 additions & 6 deletions utils/dump_url/tasks.html
@@ -180,12 +180,23 @@ Module pipelines.utils.dump_url.tasks
     chunksize: int = 10**6,
     build_json_dataframe: bool = False,
     dataframe_key_column: str = None,
+    encoding: str = "utf-8",
+    on_bad_lines: str = "error",
+    separator: str = ",",
 ) -> None:
     """
-    Dump files according to chunk size
+    Dump files according to chunk size and read mode
     """
     event_id = datetime.now().strftime("%Y%m%d-%H%M%S")
-    for idx, chunk in enumerate(pd.read_csv(Path(file_path), chunksize=chunksize)):
+    for idx, chunk in enumerate(
+        pd.read_csv(
+            Path(file_path),
+            chunksize=chunksize,
+            encoding=encoding,
+            on_bad_lines=on_bad_lines,
+            sep=separator,
+        )
+    ):
         log(f"Dumping batch {idx} with size {chunksize}")
         handle_dataframe_chunk(
             dataframe=chunk,
@@ -349,10 +360,10 @@ Returns
 </details>
 </dd>
 <dt id="pipelines.utils.dump_url.tasks.dump_files"><code class="name flex">
-<span>def <span class="ident">dump_files</span></span>(<span>file_path: str, partition_columns: List[str], save_path: str = '.', chunksize: int = 1000000, build_json_dataframe: bool = False, dataframe_key_column: str = None) ‑> None</span>
+<span>def <span class="ident">dump_files</span></span>(<span>file_path: str, partition_columns: List[str], save_path: str = '.', chunksize: int = 1000000, build_json_dataframe: bool = False, dataframe_key_column: str = None, encoding: str = 'utf-8', on_bad_lines: str = 'error', separator: str = ',') ‑> None</span>
 </code></dt>
 <dd>
-<div class="desc"><p>Dump files according to chunk size</p></div>
+<div class="desc"><p>Dump files according to chunk size and read mode</p></div>
 <details class="source">
 <summary>
 <span>Expand source code</span>
@@ -370,12 +381,23 @@
     chunksize: int = 10**6,
     build_json_dataframe: bool = False,
     dataframe_key_column: str = None,
+    encoding: str = "utf-8",
+    on_bad_lines: str = "error",
+    separator: str = ",",
 ) -> None:
     """
-    Dump files according to chunk size
+    Dump files according to chunk size and read mode
     """
     event_id = datetime.now().strftime("%Y%m%d-%H%M%S")
-    for idx, chunk in enumerate(pd.read_csv(Path(file_path), chunksize=chunksize)):
+    for idx, chunk in enumerate(
+        pd.read_csv(
+            Path(file_path),
+            chunksize=chunksize,
+            encoding=encoding,
+            on_bad_lines=on_bad_lines,
+            sep=separator,
+        )
+    ):
         log(f"Dumping batch {idx} with size {chunksize}")
         handle_dataframe_chunk(
             dataframe=chunk,
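The updated dump_files simply forwards the three options to pandas. A standalone illustration of the read it now performs (the file name is a placeholder; on_bad_lines requires pandas >= 1.3):

    from pathlib import Path
    import pandas as pd

    for idx, chunk in enumerate(
        pd.read_csv(
            Path("avaliacao_2022.csv"),  # placeholder; the flow downloads the real file first
            chunksize=10**6,
            encoding="latin-1",          # matches the bimestral_2022 table parameters
            on_bad_lines="skip",         # drop malformed rows instead of raising
            sep=";",
        )
    ):
        print(f"batch {idx}: {len(chunk)} rows")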
12 changes: 12 additions & 0 deletions utils/dump_url/utils.html
@@ -164,6 +164,12 @@ Module pipelines.utils.dump_url.utils
             parameter_defaults["materialize_to_datario"] = parameters[
                 "materialize_to_datario"
             ]
+        if "encoding" in parameters:
+            parameter_defaults["encoding"] = parameters["encoding"]
+        if "on_bad_lines" in parameters:
+            parameter_defaults["on_bad_lines"] = parameters["on_bad_lines"]
+        if "separator" in parameters:
+            parameter_defaults["separator"] = parameters["separator"]
         # if "dbt_model_secret_parameters" in parameters:
         #     parameter_defaults["dbt_model_secret_parameters"] = parameters[
         #         "dbt_model_secret_parameters"
@@ -263,6 +269,12 @@ Functions
             parameter_defaults["materialize_to_datario"] = parameters[
                 "materialize_to_datario"
             ]
+        if "encoding" in parameters:
+            parameter_defaults["encoding"] = parameters["encoding"]
+        if "on_bad_lines" in parameters:
+            parameter_defaults["on_bad_lines"] = parameters["on_bad_lines"]
+        if "separator" in parameters:
+            parameter_defaults["separator"] = parameters["separator"]
         # if "dbt_model_secret_parameters" in parameters:
         #     parameter_defaults["dbt_model_secret_parameters"] = parameters[
         #         "dbt_model_secret_parameters"
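The new keys are passed through with the same one-off if blocks already used for the materialization flags. A sketch of an equivalent, more compact pass-through (assuming parameters and parameter_defaults are plain dicts, as in the surrounding code):

    parameters = {"dataset_id": "educacao_basica_avaliacao", "encoding": "latin-1", "separator": ";"}
    parameter_defaults = {}

    for key in ("encoding", "on_bad_lines", "separator"):
        if key in parameters:
            parameter_defaults[key] = parameters[key]

    print(parameter_defaults)  # {'encoding': 'latin-1', 'separator': ';'}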
