From 1bcfcf317a7b0d047d0e08be19fc22036a1ca857 Mon Sep 17 00:00:00 2001
From: julien
Date: Thu, 26 Sep 2024 12:41:37 +0200
Subject: [PATCH 1/8] Add dbt for data modeling

---
 Dockerfile         |  2 ++
 docker-compose.yml | 25 +++++++++++++++++++++++++
 pyproject.toml     |  2 ++
 3 files changed, 29 insertions(+)

diff --git a/Dockerfile b/Dockerfile
index 50d20976..ea86f39a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -37,6 +37,8 @@ COPY quotaclimat ./quotaclimat
 COPY postgres ./postgres
 COPY alembic/ ./alembic
 COPY transform_program.py ./transform_program.py
+COPY _dbt/ ./_dbt
+COPY profiles.yml ./profiles.yml
 
 # Docker compose overwrite this config to have only one Dockerfile
 CMD ["ls"]
diff --git a/docker-compose.yml b/docker-compose.yml
index fb995fcb..af9227e6 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -26,6 +26,8 @@ services:
       - ./pyproject.toml:/app/pyproject.toml
       - ./alembic:/app/alembic
       - ./alembic.ini:/app/alembic.ini
+      - ./_dbt:/app/_dbt
+      - ./profiles.yml:/app/profiles.yml
     depends_on:
       nginxtest:
         condition: service_healthy
@@ -107,6 +109,29 @@ services:
       postgres_db:
         condition: service_healthy
 
+  dbt_runner:
+    build:
+      context: ./
+      dockerfile: Dockerfile
+    entrypoint: [ "poetry", "run", "dbt", "run", "--project-dir", "/app/_dbt" ]
+    environment:
+      ENV: docker
+      PYTHONPATH: /app
+      POSTGRES_USER: user
+      POSTGRES_DB: barometre
+      POSTGRES_PASSWORD: password
+      POSTGRES_HOST: postgres_db
+      POSTGRES_PORT: 5432
+    tty: true
+    volumes:
+      - ./quotaclimat/:/app/quotaclimat/
+      - ./postgres/:/app/postgres/
+      - ./_dbt:/app/_dbt
+      - ./profiles.yml:/app/profiles.yml
+    depends_on:
+      postgres_db:
+        condition: service_healthy
+
   postgres_db:
     image: postgres:15
     ports:
diff --git a/pyproject.toml b/pyproject.toml
index 4b3a4bb5..ff2833d8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,6 +31,8 @@ sentry-sdk = "^2.13.0"
 modin = {extras = ["ray"], version = "^0.32.0"}
 numpy = "1.26.4"
 openpyxl = "^3.1.5"
+dbt-core = "^1.8.7"
+dbt-postgres = "^1.8.2"
 [build-system]
 requires = ["poetry-core>=1.1"]
 build-backend = "poetry.core.masonry.api"
From 37d191e2a1aa5d9d17549707e27d0eb58a842850 Mon Sep 17 00:00:00 2001
From: julien
Date: Fri, 27 Sep 2024 09:01:42 +0200
Subject: [PATCH 2/8] Add dbt for data modeling

---
 _dbt/.gitignore                             |  4 +++
 _dbt/README.md                              | 15 +++++++++
 _dbt/analyses/.gitkeep                      |  0
 _dbt/dbt_project.yml                        | 36 +++++++++++++++++++++
 _dbt/macros/.gitkeep                        |  0
 _dbt/models/example/my_first_dbt_model.sql  | 27 ++++++++++++++++
 _dbt/models/example/my_second_dbt_model.sql |  6 ++++
 _dbt/models/example/schema.yml              | 21 ++++++++++++
 _dbt/seeds/.gitkeep                         |  0
 _dbt/snapshots/.gitkeep                     |  0
 _dbt/tests/.gitkeep                         |  0
 profiles.yml                                | 12 +++++++
 12 files changed, 121 insertions(+)
 create mode 100644 _dbt/.gitignore
 create mode 100644 _dbt/README.md
 create mode 100644 _dbt/analyses/.gitkeep
 create mode 100644 _dbt/dbt_project.yml
 create mode 100644 _dbt/macros/.gitkeep
 create mode 100644 _dbt/models/example/my_first_dbt_model.sql
 create mode 100644 _dbt/models/example/my_second_dbt_model.sql
 create mode 100644 _dbt/models/example/schema.yml
 create mode 100644 _dbt/seeds/.gitkeep
 create mode 100644 _dbt/snapshots/.gitkeep
 create mode 100644 _dbt/tests/.gitkeep
 create mode 100644 profiles.yml

diff --git a/_dbt/.gitignore b/_dbt/.gitignore
new file mode 100644
index 00000000..49f147cb
--- /dev/null
+++ b/_dbt/.gitignore
@@ -0,0 +1,4 @@
+
+target/
+dbt_packages/
+logs/
diff --git a/_dbt/README.md b/_dbt/README.md
new file mode 100644
index 00000000..7874ac84
--- /dev/null
+++ b/_dbt/README.md
@@ -0,0 +1,15 @@
+Welcome to your new dbt project!
+
+### Using the starter project
+
+Try running the following commands:
+- dbt run
+- dbt test
+
+
+### Resources:
+- Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction)
+- Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers
+- Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support
+- Find [dbt events](https://events.getdbt.com) near you
+- Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices
diff --git a/_dbt/analyses/.gitkeep b/_dbt/analyses/.gitkeep
new file mode 100644
index 00000000..e69de29b
diff --git a/_dbt/dbt_project.yml b/_dbt/dbt_project.yml
new file mode 100644
index 00000000..375fa65d
--- /dev/null
+++ b/_dbt/dbt_project.yml
@@ -0,0 +1,36 @@
+
+# Name your project! Project names should contain only lowercase characters
+# and underscores. A good package name should reflect your organization's
+# name or the intended use of these models
+name: '_dbt'
+version: '1.0.0'
+
+# This setting configures which "profile" dbt uses for this project.
+profile: '_dbt'
+
+# These configurations specify where dbt should look for different types of files.
+# The `model-paths` config, for example, states that models in this project can be
+# found in the "models/" directory. You probably won't need to change these!
+model-paths: ["models"]
+analysis-paths: ["analyses"]
+test-paths: ["tests"]
+seed-paths: ["seeds"]
+macro-paths: ["macros"]
+snapshot-paths: ["snapshots"]
+
+clean-targets:         # directories to be removed by `dbt clean`
+  - "target"
+  - "dbt_packages"
+
+
+# Configuring models
+# Full documentation: https://docs.getdbt.com/docs/configuring-models
+
+# In this example config, we tell dbt to build all models in the example/
+# directory as views. These settings can be overridden in the individual model
+# files using the `{{ config(...) }}` macro.
+models:
+  _dbt:
+    # Config indicated by + and applies to all files under models/example/
+    example:
+      +materialized: view
diff --git a/_dbt/macros/.gitkeep b/_dbt/macros/.gitkeep
new file mode 100644
index 00000000..e69de29b
diff --git a/_dbt/models/example/my_first_dbt_model.sql b/_dbt/models/example/my_first_dbt_model.sql
new file mode 100644
index 00000000..f31a12d9
--- /dev/null
+++ b/_dbt/models/example/my_first_dbt_model.sql
@@ -0,0 +1,27 @@
+
+/*
+    Welcome to your first dbt model!
+    Did you know that you can also configure models directly within SQL files?
+    This will override configurations stated in dbt_project.yml
+
+    Try changing "table" to "view" below
+*/
+
+{{ config(materialized='table') }}
+
+with source_data as (
+
+    select 1 as id
+    union all
+    select null as id
+
+)
+
+select *
+from source_data
+
+/*
+    Uncomment the line below to remove records with null `id` values
+*/
+
+-- where id is not null
diff --git a/_dbt/models/example/my_second_dbt_model.sql b/_dbt/models/example/my_second_dbt_model.sql
new file mode 100644
index 00000000..c91f8793
--- /dev/null
+++ b/_dbt/models/example/my_second_dbt_model.sql
@@ -0,0 +1,6 @@
+
+-- Use the `ref` function to select from other models
+
+select *
+from {{ ref('my_first_dbt_model') }}
+where id = 1
diff --git a/_dbt/models/example/schema.yml b/_dbt/models/example/schema.yml
new file mode 100644
index 00000000..9730b707
--- /dev/null
+++ b/_dbt/models/example/schema.yml
@@ -0,0 +1,21 @@
+
+version: 2
+
+models:
+  - name: my_first_dbt_model
+    description: "A starter dbt model"
+    columns:
+      - name: id
+        description: "The primary key for this table"
+        data_tests:
+          - unique
+          - not_null
+
+  - name: my_second_dbt_model
+    description: "A starter dbt model"
+    columns:
+      - name: id
+        description: "The primary key for this table"
+        data_tests:
+          - unique
+          - not_null
diff --git a/_dbt/seeds/.gitkeep b/_dbt/seeds/.gitkeep
new file mode 100644
index 00000000..e69de29b
diff --git a/_dbt/snapshots/.gitkeep b/_dbt/snapshots/.gitkeep
new file mode 100644
index 00000000..e69de29b
diff --git a/_dbt/tests/.gitkeep b/_dbt/tests/.gitkeep
new file mode 100644
index 00000000..e69de29b
diff --git a/profiles.yml b/profiles.yml
new file mode 100644
index 00000000..703006ac
--- /dev/null
+++ b/profiles.yml
@@ -0,0 +1,12 @@
+_dbt:
+  target: dev
+  outputs:
+    dev:  # connection settings are read from POSTGRES_* env vars; the defaults are the docker-compose values
+      type: postgres
+      host: "{{ env_var('POSTGRES_HOST', 'postgres_db') }}"
+      user: "{{ env_var('POSTGRES_USER', 'user') }}"
+      password: "{{ env_var('POSTGRES_PASSWORD', 'password') }}"
+      port: "{{ env_var('POSTGRES_PORT', '5432') | as_number }}"  # as_number: dbt expects an integer port
+      dbname: "{{ env_var('POSTGRES_DB', 'barometre') }}"
+      schema: public
+      threads: 1
From 64fa5d586f4a3f4ffaed8d1236df68469bec1a7d Mon Sep 17 00:00:00 2001
From: "julien.vansteenkiste"
Date: Wed, 2 Oct 2024 15:48:34 +0200
Subject: [PATCH 3/8] Remove dbt examples

---
 _dbt/models/example/my_first_dbt_model.sql  | 27 ---------------------
 _dbt/models/example/my_second_dbt_model.sql |  6 -----
 _dbt/models/example/schema.yml              | 21 ----------------
 3 files changed, 54 deletions(-)
 delete mode 100644 _dbt/models/example/my_first_dbt_model.sql
 delete mode 100644 _dbt/models/example/my_second_dbt_model.sql
 delete mode 100644 _dbt/models/example/schema.yml

diff --git a/_dbt/models/example/my_first_dbt_model.sql b/_dbt/models/example/my_first_dbt_model.sql
deleted file mode 100644
index f31a12d9..00000000
--- a/_dbt/models/example/my_first_dbt_model.sql
+++ /dev/null
@@ -1,27 +0,0 @@
-
-/*
-    Welcome to your first dbt model!
-    Did you know that you can also configure models directly within SQL files?
-    This will override configurations stated in dbt_project.yml
-
-    Try changing "table" to "view" below
-*/
-
-{{ config(materialized='table') }}
-
-with source_data as (
-
-    select 1 as id
-    union all
-    select null as id
-
-)
-
-select *
-from source_data
-
-/*
-    Uncomment the line below to remove records with null `id` values
-*/
-
--- where id is not null
diff --git a/_dbt/models/example/my_second_dbt_model.sql b/_dbt/models/example/my_second_dbt_model.sql
deleted file mode 100644
index c91f8793..00000000
--- a/_dbt/models/example/my_second_dbt_model.sql
+++ /dev/null
@@ -1,6 +0,0 @@
-
--- Use the `ref` function to select from other models
-
-select *
-from {{ ref('my_first_dbt_model') }}
-where id = 1
diff --git a/_dbt/models/example/schema.yml b/_dbt/models/example/schema.yml
deleted file mode 100644
index 9730b707..00000000
--- a/_dbt/models/example/schema.yml
+++ /dev/null
@@ -1,21 +0,0 @@
-
-version: 2
-
-models:
-  - name: my_first_dbt_model
-    description: "A starter dbt model"
-    columns:
-      - name: id
-        description: "The primary key for this table"
-        data_tests:
-          - unique
-          - not_null
-
-  - name: my_second_dbt_model
-    description: "A starter dbt model"
-    columns:
-      - name: id
-        description: "The primary key for this table"
-        data_tests:
-          - unique
-          - not_null
From 1f0f27d5c0dbf84ae1b7b8cd5e46b2f132893313 Mon Sep 17 00:00:00 2001
From: "julien.vansteenkiste"
Date: Wed, 2 Oct 2024 15:49:17 +0200
Subject: [PATCH 4/8] Add metadata for keywords table

---
 _dbt/models/staging/_schema.yml   | 21 +++++++++++++++++++++
 _dbt/models/staging/_sources.yaml |  8 ++++++++
 2 files changed, 29 insertions(+)
 create mode 100644 _dbt/models/staging/_schema.yml
 create mode 100644 _dbt/models/staging/_sources.yaml

diff --git a/_dbt/models/staging/_schema.yml b/_dbt/models/staging/_schema.yml
new file mode 100644
index 00000000..d02bd640
--- /dev/null
+++ b/_dbt/models/staging/_schema.yml
@@ -0,0 +1,21 @@
+version: 2
+
+models:
+  - name: stg_keywords
+    description: List of keywords said during channel_program
+    columns:
+      - name: keyword_id
+        description: "The primary key for this table"
+        data_tests:
+          - unique
+          - not_null
+
+      - name: updated_at
+        description: "Last date when keywords have been updated"
+#        data_tests:
+#          - not_null
+
+      - name: channel_title
+        description: "Title of the channel"
+#        data_tests: # Uncomment to trigger data test error
+#          - not_null
\ No newline at end of file
diff --git a/_dbt/models/staging/_sources.yaml b/_dbt/models/staging/_sources.yaml
new file mode 100644
index 00000000..9613d101
--- /dev/null
+++ b/_dbt/models/staging/_sources.yaml
@@ -0,0 +1,8 @@
+version: 2
+
+sources:
+  - name: quotaclimat
+    database: barometre
+    schema: public
+    tables:
+      - name: keywords
\ No newline at end of file
From 8546566670265de4157f1c0424b437522f71cec5 Mon Sep 17 00:00:00 2001
From: "julien.vansteenkiste"
Date: Wed, 2 Oct 2024 15:49:39 +0200
Subject: [PATCH 5/8] Add pgadmin and update dbt run command

---
 docker-compose.yml | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index af9227e6..e68f6d2c 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -113,7 +113,7 @@ services:
     build:
       context: ./
       dockerfile: Dockerfile
-    entrypoint: [ "poetry", "run", "dbt", "run", "--project-dir", "/app/_dbt" ]
+    entrypoint: [ "poetry", "run", "dbt", "build", "--project-dir", "/app/_dbt" ]
     environment:
       ENV: docker
       PYTHONPATH: /app
@@ -128,6 +128,8 @@ services:
       - ./postgres/:/app/postgres/
       - ./_dbt:/app/_dbt
       - ./profiles.yml:/app/profiles.yml
+    networks:
+      - db_network
     depends_on:
       postgres_db:
         condition: service_healthy
@@ -149,6 +151,24 @@ services:
       POSTGRES_PASSWORD: password
     logging: # no logs for postgres container
       driver: none
+    networks:
+      # Keep `default` listed: a service with its own `networks:` key leaves the
+      # default network, where the services without that key reach postgres_db.
+      - default
+      - db_network
+
+  pgadmin:
+    image: dpage/pgadmin4
+    container_name: pgadmin
+    environment:
+      PGADMIN_DEFAULT_EMAIL: admin@example.com
+      PGADMIN_DEFAULT_PASSWORD: admin_password
+    ports:
+      - "8080:80"
+    networks:
+      - db_network
+    depends_on:
+      - postgres_db
 
   mediatree:
     ports:
@@ -221,6 +241,10 @@ services:
       postgres_db:
         condition: service_healthy
 
+networks:
+  db_network:
+    driver: bridge
+
 secrets: # https://docs.docker.com/compose/use-secrets/
   pwd_api:
     file: secrets/pwd_api.txt
From 44d23c7546a60963a78ce36fd6d63f0860de7010 Mon Sep 17 00:00:00 2001
From: "julien.vansteenkiste"
Date: Wed, 2 Oct 2024 15:50:20 +0200
Subject: [PATCH 6/8] Define dbt models materialization mode

---
 _dbt/dbt_project.yml | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/_dbt/dbt_project.yml b/_dbt/dbt_project.yml
index 375fa65d..7f83ada2 100644
--- a/_dbt/dbt_project.yml
+++ b/_dbt/dbt_project.yml
@@ -32,5 +32,7 @@ clean-targets:         # directories to be removed by `dbt clean`
 models:
   _dbt:
-    # Config indicated by + and applies to all files under models/example/
-    example:
+    # Configs prefixed with + apply to all models under models/staging/ and models/intermediate/ respectively
+    staging:
+      +materialized: view
+    intermediate:
       +materialized: view
From a358722e2fa23e4dba7cf19cf7b31a2476eb0907 Mon Sep 17 00:00:00 2001
From: "julien.vansteenkiste"
Date: Wed, 2 Oct 2024 15:50:51 +0200
Subject: [PATCH 7/8] Add example of staging model for keyword table

---
 _dbt/models/staging/stg_keywords.sql | 47 ++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)
 create mode 100644 _dbt/models/staging/stg_keywords.sql

diff --git a/_dbt/models/staging/stg_keywords.sql b/_dbt/models/staging/stg_keywords.sql
new file mode 100644
index 00000000..57bae473
--- /dev/null
+++ b/_dbt/models/staging/stg_keywords.sql
@@ -0,0 +1,47 @@
+-- Staging layer: light, row-level cleanup of the raw `keywords` source
+-- (renaming, empty-string normalisation, casts, simple JSON extraction).
+-- See https://docs.getdbt.com/best-practices/how-we-structure/2-staging
+
+with keywords as (
+    select * from {{ source('quotaclimat', 'keywords') }}
+),
+
+renamed as (
+    select
+        id as keyword_id,
+        -- Empty strings carry no information: expose them as NULL.
+        nullif(channel_name, '') as channel_name,
+        nullif(channel_title, '') as channel_title,
+        nullif(channel_program, '') as channel_program,
+        nullif(channel_program_type, '') as channel_program_type,
+        start,
+        -- NOTE(review): the search pattern is an empty string, so replace()
+        -- changes nothing and only trim() has an effect. The token meant to
+        -- be stripped looks lost; confirm and restore it.
+        trim(replace(plaintext, '', '')) as plain_text,
+        theme,
+        -- coalesce() keeps both flags strictly TRUE/FALSE when theme is NULL.
+        coalesce(
+            theme::text like '%changement_climatique_constat_indirectes%',
+            false
+        ) as is_climatic_change_subject,
+        coalesce(
+            theme::text like '%biodiversite_concepts_generaux_indirectes%',
+            false
+        ) as is_biodiversity_general_indirect_concept,
+        created_at,
+        updated_at::timestamp with time zone as updated_at,
+        -- JSON arrays are 0-indexed: element 0 is the first entry, the same
+        -- one first_keyword_date is read from.
+        -- NOTE(review): confirm that 'keywords' is the right object key.
+        keywords_with_timestamp -> 0 ->> 'keywords' as first_keyword,
+        -- 'timestamp' is presumably epoch milliseconds (confirm); the bigint
+        -- division by 1000 drops the sub-second part.
+        to_timestamp(
+            (keywords_with_timestamp -> 0 ->> 'timestamp')::bigint / 1000
+        ) as first_keyword_date,
+        json_array_length(keywords_with_timestamp) as keywords_count
+    from keywords
+)
+
+select * from renamed
From 43c915d295ca68fdf56b95c94efcb28fd16ec661 Mon Sep 17 00:00:00 2001
From: "julien.vansteenkiste"
Date: Wed, 2 Oct 2024 15:51:00 +0200
Subject: [PATCH 8/8] Add example of intermediate model for keyword table

---
 ...eywords_aggregated_by_days_and_channel.sql | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 _dbt/models/intermediate/int_keywords_aggregated_by_days_and_channel.sql

diff --git a/_dbt/models/intermediate/int_keywords_aggregated_by_days_and_channel.sql b/_dbt/models/intermediate/int_keywords_aggregated_by_days_and_channel.sql
new file mode 100644
index 00000000..1ca3d0c7
--- /dev/null
+++ b/_dbt/models/intermediate/int_keywords_aggregated_by_days_and_channel.sql
@@ -0,0 +1,26 @@
+-- Intermediate layer: heavier transformations (joins, aggregations, ...) built
+-- on top of staging models.
+-- See https://docs.getdbt.com/best-practices/how-we-structure/3-intermediate
+
+with keywords as (
+    select *
+    from {{ ref('stg_keywords') }}
+),
+
+-- One row per channel_title and calendar day, with the number of keyword rows.
+keywords_grouped_by_days_and_channels as (
+    select
+        channel_title,
+        start::date as start,
+        -- Explicit alias equal to the name PostgreSQL gives an unaliased
+        -- count(*), so the output column is still called `count`.
+        count(*) as count
+    from keywords
+    -- Group by the expression, not the bare name: `group by start` would bind
+    -- to the input timestamp column rather than to the date alias.
+    group by
+        channel_title,
+        start::date
+)
+
+select * from keywords_grouped_by_days_and_channels