diff --git a/Dockerfile b/Dockerfile index 50d20976..ea86f39a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -37,6 +37,8 @@ COPY quotaclimat ./quotaclimat COPY postgres ./postgres COPY alembic/ ./alembic COPY transform_program.py ./transform_program.py +COPY _dbt/ ./_dbt +COPY profiles.yml ./profiles.yml # Docker compose overwrite this config to have only one Dockerfile CMD ["ls"] diff --git a/_dbt/.gitignore b/_dbt/.gitignore new file mode 100644 index 00000000..49f147cb --- /dev/null +++ b/_dbt/.gitignore @@ -0,0 +1,4 @@ + +target/ +dbt_packages/ +logs/ diff --git a/_dbt/README.md b/_dbt/README.md new file mode 100644 index 00000000..7874ac84 --- /dev/null +++ b/_dbt/README.md @@ -0,0 +1,15 @@ +Welcome to your new dbt project! + +### Using the starter project + +Try running the following commands: +- dbt run +- dbt test + + +### Resources: +- Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) +- Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers +- Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support +- Find [dbt events](https://events.getdbt.com) near you +- Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices diff --git a/_dbt/analyses/.gitkeep b/_dbt/analyses/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/_dbt/dbt_project.yml b/_dbt/dbt_project.yml new file mode 100644 index 00000000..7f83ada2 --- /dev/null +++ b/_dbt/dbt_project.yml @@ -0,0 +1,38 @@ + +# Name your project! Project names should contain only lowercase characters +# and underscores. A good package name should reflect your organization's +# name or the intended use of these models +name: '_dbt' +version: '1.0.0' + +# This setting configures which "profile" dbt uses for this project. +profile: '_dbt' + +# These configurations specify where dbt should look for different types of files. 
+# The `model-paths` config, for example, states that models in this project can be +# found in the "models/" directory. You probably won't need to change these! +model-paths: ["models"] +analysis-paths: ["analyses"] +test-paths: ["tests"] +seed-paths: ["seeds"] +macro-paths: ["macros"] +snapshot-paths: ["snapshots"] + +clean-targets: # directories to be removed by `dbt clean` + - "target" + - "dbt_packages" + + +# Configuring models +# Full documentation: https://docs.getdbt.com/docs/configuring-models + +# In this config, we tell dbt to build all models in the staging/ and +# intermediate/ directories as views. These settings can be overridden in the individual model +# files using the `{{ config(...) }}` macro. +models: + _dbt: + # Config indicated by + applies to all files under the matching models/ subdirectory + staging: + +materialized: view + intermediate: + +materialized: view diff --git a/_dbt/macros/.gitkeep b/_dbt/macros/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/_dbt/models/intermediate/int_keywords_aggregated_by_days_and_channel.sql b/_dbt/models/intermediate/int_keywords_aggregated_by_days_and_channel.sql new file mode 100644 index 00000000..1ca3d0c7 --- /dev/null +++ b/_dbt/models/intermediate/int_keywords_aggregated_by_days_and_channel.sql @@ -0,0 +1,19 @@ +/* L'étape 'intermediate' est utilisée pour réaliser des opérations de transformations plus avancées (join, group by, ...) 
: +https://docs.getdbt.com/best-practices/how-we-structure/3-intermediate */ + +with keywords as ( + select + * + from {{ ref('stg_keywords') }} +), + +keywords_grouped_by_days_and_channels as ( + select + channel_title, + start::date as start, + count(*) + from keywords + group by 1, 2 +) + +select * from keywords_grouped_by_days_and_channels \ No newline at end of file diff --git a/_dbt/models/staging/_schema.yml b/_dbt/models/staging/_schema.yml new file mode 100644 index 00000000..d02bd640 --- /dev/null +++ b/_dbt/models/staging/_schema.yml @@ -0,0 +1,21 @@ +version: 2 + +models: + - name: stg_keywords + description: List of keywords said during channel_program + columns: + - name: keyword_id + description: "The primary key for this table" + data_tests: + - unique + - not_null + + - name: updated_at + description: "Last date when keywords have been updated" +# data_tests: +# - not_null + + - name: channel_title + description: "Title of the channel" +# data_tests: # Uncomment to trigger data test error +# - not_null \ No newline at end of file diff --git a/_dbt/models/staging/_sources.yaml b/_dbt/models/staging/_sources.yaml new file mode 100644 index 00000000..9613d101 --- /dev/null +++ b/_dbt/models/staging/_sources.yaml @@ -0,0 +1,8 @@ +version: 2 + +sources: + - name: quotaclimat + database: barometre + schema: public + tables: + - name: keywords \ No newline at end of file diff --git a/_dbt/models/staging/stg_keywords.sql b/_dbt/models/staging/stg_keywords.sql new file mode 100644 index 00000000..57bae473 --- /dev/null +++ b/_dbt/models/staging/stg_keywords.sql @@ -0,0 +1,45 @@ +-- The 'staging' layer performs basic cleaning operations (empty strings to null, casts, renames): https://docs.getdbt.com/best-practices/how-we-structure/2-staging + +with keywords as ( + select * from {{ source('quotaclimat', 'keywords') }} +), + +renamed as ( + select + id as keyword_id, + case + when channel_name = '' then null + else channel_name + end as channel_name, + case + when 
channel_title = '' then null + else channel_title + end as channel_title, + case + when channel_program = '' then null + else channel_program + end as channel_program, + case + when channel_program_type = '' then null + else channel_program_type + end as channel_program_type, + start, + TRIM(REPLACE(plaintext, '', '')) as plain_text, -- NOTE(review): the search pattern is an empty string, so this REPLACE looks like a no-op; confirm which token was meant to be removed + theme, + case + when theme::text like '%changement_climatique_constat_indirectes%' then TRUE + else FALSE + end as is_climatic_change_subject, + case + when theme::text like '%biodiversite_concepts_generaux_indirectes%' then TRUE + else FALSE + end as is_biodiversity_general_indirect_concept, + created_at, + updated_at::timestamp with time zone as updated_at, + keywords_with_timestamp->0->>'keywords' as first_keyword, -- element 0 is the first array entry, the same one first_keyword_date reads; NOTE(review): confirm the 'keywords' key name + to_timestamp((keywords_with_timestamp->0->>'timestamp')::bigint / 1000) AS first_keyword_date, + json_array_length(keywords_with_timestamp) as keywords_count + from keywords +) + +select * from renamed \ No newline at end of file diff --git a/_dbt/seeds/.gitkeep b/_dbt/seeds/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/_dbt/snapshots/.gitkeep b/_dbt/snapshots/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/_dbt/tests/.gitkeep b/_dbt/tests/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/docker-compose.yml b/docker-compose.yml index fb995fcb..e68f6d2c 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -26,6 +26,8 @@ services: - ./pyproject.toml:/app/pyproject.toml - ./alembic:/app/alembic - ./alembic.ini:/app/alembic.ini + - ./_dbt:/app/_dbt + - ./profiles.yml:/app/profiles.yml depends_on: nginxtest: condition: service_healthy @@ -107,6 +109,31 @@ services: postgres_db: condition: service_healthy + dbt_runner: + build: + context: ./ + dockerfile: Dockerfile + entrypoint: [ "poetry", "run", "dbt", "build", "--project-dir", "/app/_dbt" ] + environment: + ENV: docker + PYTHONPATH: /app + POSTGRES_USER: user + POSTGRES_DB: barometre + POSTGRES_PASSWORD: 
password + POSTGRES_HOST: postgres_db + POSTGRES_PORT: 5432 + tty: true + volumes: + - ./quotaclimat/:/app/quotaclimat/ + - ./postgres/:/app/postgres/ + - ./_dbt:/app/_dbt + - ./profiles.yml:/app/profiles.yml + networks: + - db_network + depends_on: + postgres_db: + condition: service_healthy + postgres_db: image: postgres:15 ports: @@ -124,6 +151,22 @@ services: POSTGRES_PASSWORD: password logging: # no logs for postgres container driver: none + networks: + - db_network + - default # stay on the default network too, so services that declare no networks can still resolve postgres_db + + pgadmin: + image: dpage/pgadmin4 + container_name: pgadmin + environment: + PGADMIN_DEFAULT_EMAIL: admin@example.com + PGADMIN_DEFAULT_PASSWORD: admin_password + ports: + - "8080:80" + networks: + - db_network + depends_on: + - postgres_db mediatree: ports: @@ -196,6 +239,10 @@ services: postgres_db: condition: service_healthy +networks: + db_network: + driver: bridge + secrets: # https://docs.docker.com/compose/use-secrets/ pwd_api: file: secrets/pwd_api.txt diff --git a/profiles.yml b/profiles.yml new file mode 100644 index 00000000..703006ac --- /dev/null +++ b/profiles.yml @@ -0,0 +1,12 @@ +_dbt: + target: dev + outputs: + dev: + type: postgres + host: "{{ env_var('POSTGRES_HOST', 'postgres_db') }}" # connection settings read the POSTGRES_* variables exported by docker-compose; defaults match the local stack + user: "{{ env_var('POSTGRES_USER', 'user') }}" + password: "{{ env_var('POSTGRES_PASSWORD', 'password') }}" + port: "{{ env_var('POSTGRES_PORT', '5432') | as_number }}" + dbname: barometre + schema: public + threads: 1 diff --git a/pyproject.toml b/pyproject.toml index 4b3a4bb5..ff2833d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,8 @@ sentry-sdk = "^2.13.0" modin = {extras = ["ray"], version = "^0.32.0"} numpy = "1.26.4" openpyxl = "^3.1.5" +dbt-core = "^1.8.7" +dbt-postgres = "^1.8.2" [build-system] requires = ["poetry-core>=1.1"] build-backend = "poetry.core.masonry.api"