Phase2a #122

Open

wants to merge 60 commits into base: master

Commits (60)
e411af1  phase1 (a-s-gorski, Apr 20, 2024)
22a1d98  Merge pull request #2 from a-s-gorski/feat/ph1 (a-s-gorski, Apr 20, 2024)
1cf339e  added infracost with usage yml (a-s-gorski, Apr 21, 2024)
010aaf3  docs (a-s-gorski, Apr 21, 2024)
0133fe0  Merge branch 'feat/ph1' of https://github.com/a-s-gorski/tbd-workshop… (a-s-gorski, Apr 21, 2024)
f4ef0b2  my commit (a-s-gorski, Apr 21, 2024)
4f6b2c6  Merge pull request #4 from a-s-gorski/feat/ph1 (a-s-gorski, Apr 21, 2024)
cc9dc99  readme final fix (a-s-gorski, Apr 21, 2024)
343f6fc  cleanup (a-s-gorski, Apr 21, 2024)
673012e  Merge pull request #5 from a-s-gorski/feat/ph1 (a-s-gorski, Apr 21, 2024)
aad787b  Update task 10 (Apr 24, 2024)
b3607cd  release without init (a-s-gorski, Apr 24, 2024)
7b511d7  Update tasks (oracky, Apr 24, 2024)
6a2defb  Fix new line (oracky, Apr 24, 2024)
d836551  Fix formatting (oracky, Apr 24, 2024)
c17b437  Fix formatting (oracky, Apr 24, 2024)
a48a4f6  Fix formatting (oracky, Apr 24, 2024)
e2954e3  Add description (oracky, Apr 24, 2024)
2d7eddc  updated graph (a-s-gorski, Apr 24, 2024)
3f6a96b  updated graph (a-s-gorski, Apr 24, 2024)
00ea5a8  Update description (oracky, Apr 24, 2024)
43face3  x (a-s-gorski, May 12, 2024)
c0f4843  x (a-s-gorski, May 16, 2024)
7392dea  Merge branch 'sync_release_2' (a-s-gorski, May 16, 2024)
1c8ddcd  a (a-s-gorski, May 16, 2024)
7500b2c  cleaning checks (a-s-gorski, May 16, 2024)
1d4c692  Merge pull request #6 from a-s-gorski/example (a-s-gorski, May 16, 2024)
52f239c  returning release (a-s-gorski, May 16, 2024)
92f92f9  Merge pull request #7 from a-s-gorski/example (a-s-gorski, May 16, 2024)
e3fb3f4  a (a-s-gorski, May 18, 2024)
62fb6d2  Merge pull request #8 from a-s-gorski/example (a-s-gorski, May 18, 2024)
9b353ca  adding memory (a-s-gorski, May 18, 2024)
bdc17c1  Merge pull request #9 from a-s-gorski/example (a-s-gorski, May 18, 2024)
2b2bd09  reversing to previous changes (a-s-gorski, Jun 1, 2024)
3da85f7  Merge pull request #10 from a-s-gorski/fix-release (a-s-gorski, Jun 1, 2024)
247332d  reversing to previous changes2 (a-s-gorski, Jun 1, 2024)
b2225c3  reversing to previous changes23 (a-s-gorski, Jun 1, 2024)
4418e60  Merge pull request #11 from a-s-gorski/fix-release (a-s-gorski, Jun 1, 2024)
4a025be  reversed storage to try not to overdraft quotas (a-s-gorski, Jun 1, 2024)
a50eb27  Merge pull request #12 from a-s-gorski/fix-release (a-s-gorski, Jun 1, 2024)
b562c2b  readme-phase2a from source (a-s-gorski, Jun 2, 2024)
92e969d  Merge pull request #13 from a-s-gorski/fix-release (a-s-gorski, Jun 2, 2024)
4a70eca  added 2a (a-s-gorski, Jun 3, 2024)
3512e77  Merge pull request #14 from a-s-gorski/fix-release (a-s-gorski, Jun 3, 2024)
dbcbfd5  increased max allocation (a-s-gorski, Jun 3, 2024)
cd1c737  Merge pull request #15 from a-s-gorski/fix-release (a-s-gorski, Jun 3, 2024)
c38bdc6  increased max allocation (a-s-gorski, Jun 5, 2024)
c29c470  Merge pull request #16 from a-s-gorski/fix-release (a-s-gorski, Jun 5, 2024)
ae20a9b  increased memory (a-s-gorski, Jun 10, 2024)
a8fa0be  Merge pull request #17 from a-s-gorski/fix-release (a-s-gorski, Jun 10, 2024)
1dad439  fixed sync conflicts (a-s-gorski, Jun 10, 2024)
23e2ed0  Merge pull request #18 from a-s-gorski/fix-release (a-s-gorski, Jun 10, 2024)
423cf54  increased max worker (a-s-gorski, Jun 11, 2024)
181bb65  Merge pull request #19 from a-s-gorski/fix-release (a-s-gorski, Jun 11, 2024)
8bdcefa  update Dataproc (zgoreck4, Jun 12, 2024)
568b178  skip CKV_GCP_91 (zgoreck4, Jun 12, 2024)
184fba7  fix skip CKV_GCP_91 (zgoreck4, Jun 12, 2024)
ae7a74f  fix skip CKV_GCP_91 (zgoreck4, Jun 12, 2024)
e3e83b0  Change memory (zgoreck4, Jun 12, 2024)
a79b199  skip checks (zgoreck4, Jun 12, 2024)
1 change: 1 addition & 0 deletions .github/workflows/pull-request.yml
@@ -155,6 +155,7 @@ jobs:
run: |
infracost breakdown --path="." \
--format=json \
--usage-file infracost-usage.yml \
--out-file=/tmp/infracost-base.json
- name: Checkout PR branch
uses: actions/checkout@v3
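
The `--usage-file` flag added here is what makes the estimate meaningful for usage-based resources: Infracost prices storage volume, operations, and egress only when it is given usage estimates, which the `infracost-usage.yml` introduced later in this PR supplies. Without the flag, only fixed-price resources show a non-zero monthly cost in the breakdown.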
84 changes: 76 additions & 8 deletions README-phase-2a.md
@@ -18,9 +18,9 @@ Worth to read:

2. Authors:

***Enter your group nr***
7

***Link to forked repo***
[***Link to forked repo***](https://github.com/a-s-gorski/tbd-workshop-1)

3. Sync your repo with https://github.com/bdg-tbd/tbd-workshop-1.

@@ -77,19 +77,87 @@ the running instance of your Vertex AI Workbench

7. Explore files created by generator and describe them, including format, content, total size.

***Files description***
There are three directories, Batch1, Batch2 and Batch3, each containing multiple CSV and text files. The largest are DailyMarket.txt (296 MB) and WatchHistory.txt (134 MB). The directories also contain numerous FINWIRE files ranging in size from roughly 70 KB to 900 KB.

8. Analyze tpcdi.py. What happened in the loading stage?

***Your answer***
During loading, since no file_name is specified, every source file is read and written to the bucket whose name is taken from an environment variable. This covers the DATE, DAILY_MARKET, INDUSTRY, PROSPECT, CUSTOMER_MGMT, TAX_RATE, HR, WATCH_HISTORY, TRADE, TRADE_HISTORY, STATUS_TYPE, TRADE_TYPE, HOLDING_HISTORY, CASH_TRANSACTION, CMP, SEC and FINWIRE files.
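
As a rough illustration of that behavior, here is a minimal sketch of the dispatch logic described above (names such as `upload_to_bucket` and `TPCDI_BUCKET` are hypothetical, not taken from tpcdi.py):

```python
import os

# Hypothetical sketch of the loading stage described above: when no
# file_name is given, every known TPC-DI source file is uploaded.
KNOWN_SOURCES = [
    "DATE", "DAILY_MARKET", "INDUSTRY", "PROSPECT", "CUSTOMER_MGMT",
    "TAX_RATE", "HR", "WATCH_HISTORY", "TRADE", "TRADE_HISTORY",
    "STATUS_TYPE", "TRADE_TYPE", "HOLDING_HISTORY", "CASH_TRANSACTION",
    "CMP", "SEC", "FINWIRE",
]

def upload_to_bucket(bucket: str, name: str) -> None:
    """Hypothetical helper; the real script uses a GCS client here."""
    print(f"uploading {name} to gs://{bucket}/")

def load(bucket: str, file_name: str | None = None) -> None:
    # With file_name=None (the case in question), all sources are loaded.
    for name in ([file_name] if file_name else KNOWN_SOURCES):
        upload_to_bucket(bucket, name)

# The target bucket is read from an environment variable, e.g.:
# load(os.environ["TPCDI_BUCKET"])
```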

9. Using SparkSQL answer: how many tables were created in each layer?

***SparkSQL command and output***
```python
from pyspark.sql import SparkSession

def get_session():
    session = SparkSession.builder \
        .appName("TBD-TPC-DI-setup") \
        .enableHiveSupport() \
        .getOrCreate()
    for db in ['digen', 'bronze', 'silver', 'gold']:
        session.sql(f"CREATE DATABASE IF NOT EXISTS {db} "
                    f"LOCATION 'hdfs:///user/hive/warehouse/{db}.db'")
    session.sql('USE digen')
    return session

query = """
SHOW TABLES
"""
session = get_session()
result_df = session.sql(query)
result_df.show()
```

```
+---------+----------------+-----------+
|namespace|       tableName|isTemporary|
+---------+----------------+-----------+
|    digen|cash_transaction|      false|
|    digen|             cmp|      false|
|    digen|   customer_mgmt|      false|
|    digen|    daily_market|      false|
|    digen|            date|      false|
|    digen|             fin|      false|
|    digen| holding_history|      false|
|    digen|              hr|      false|
|    digen|        industry|      false|
|    digen|        prospect|      false|
|    digen|             sec|      false|
|    digen|     status_type|      false|
|    digen|        tax_rate|      false|
|    digen|           trade|      false|
|    digen|   trade_history|      false|
|    digen|      trade_type|      false|
|    digen|   watch_history|      false|
+---------+----------------+-----------+
```

In the digen layer, 17 tables were created; a sketch for counting all four layers at once follows below.
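
To answer per layer in a single pass, loop over the databases that `get_session` creates (reusing the `session` object from the snippet above):

```python
# Count tables in each layer database. Right after data generation only
# digen is populated; bronze, silver and gold stay at 0 until the dbt
# models build them.
for db in ["digen", "bronze", "silver", "gold"]:
    n = session.sql(f"SHOW TABLES IN {db}").count()
    print(f"{db}: {n} tables")
```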


10. Add 3 more [dbt tests](https://docs.getdbt.com/docs/build/tests) and explain what you are testing. ***Add new tests to your repository.***

***Code and description of your tests***
Testing that no cash transaction has a negative amount:
```sql
select ct_ca_id
from {{ source('brokerage', 'cash_transaction') }}
where ct_amt < 0
```
Testing that no status_type row has an empty status name:
```sql
select st_id
from {{ source('reference', 'status_type') }}
where st_name is null
```
Testing that no employee is recorded as their own manager:
```sql
select *
from {{ source('hr', 'hr') }}
where employee_id = manager_id
```
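
All three are dbt singular tests: each query lives in its own `.sql` file under the project's `tests/` directory, dbt compiles and runs it, and the test fails if the query returns any rows, which is why each one selects the offending records rather than the valid ones.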




11. In main.tf update
@@ -100,4 +168,4 @@

12. Redeploy infrastructure and check if the DAG finished with no errors:

***The screenshot of Apache Airflow UI***
1 change: 1 addition & 0 deletions README.md
@@ -183,3 +183,4 @@ terraform destroy -no-color -var-file env/project.tfvars

No outputs.
<!-- END OF PRE-COMMIT-TERRAFORM DOCS HOOK -->
aa
1 change: 1 addition & 0 deletions bootstrap/.terraform.lock.hcl


74 changes: 56 additions & 18 deletions bootstrap/main.tf
@@ -6,7 +6,6 @@ resource "google_project" "tbd_project" {
name = "TBD ${local.project} project"
project_id = local.project
billing_account = var.billing_account
## change it to break the checkov during the labs
auto_create_network = false
lifecycle {
prevent_destroy = true
@@ -27,7 +26,6 @@ resource "google_project_iam_audit_config" "tbd_project_audit" {
}
}


resource "google_project_service" "tbd-service" {
project = google_project.tbd_project.project_id
disable_dependent_services = true
@@ -36,7 +34,8 @@ resource "google_project_service" "tbd-service" {
"iam.googleapis.com",
"iamcredentials.googleapis.com",
"serviceusage.googleapis.com",
"sts.googleapis.com"])
"sts.googleapis.com"
])
service = each.value
}

@@ -45,11 +44,7 @@ resource "google_service_account" "tbd-terraform" {
account_id = "${local.project}-lab"
}


resource "google_project_iam_member" "tbd-editor-supervisors" {
#checkov:skip=CKV_GCP_49: "Ensure no roles that enable to impersonate and manage all service accounts are used at a project level"
#checkov:skip=CKV_GCP_117: "Ensure basic roles are not used at project level."
# This is only used for workshops!!!
for_each = toset([
"user:[email protected]",
"user:[email protected]",
@@ -58,32 +53,75 @@ ])
project = google_project.tbd_project.project_id
role = "roles/editor"
member = each.value
#checkov:skip=CKV_GCP_117: "Ensure basic roles are not used at project level."
}


resource "google_project_iam_member" "tbd-editor-member" {
#checkov:skip=CKV_GCP_49: "Ensure no roles that enable to impersonate and manage all service accounts are used at a project level"
#checkov:skip=CKV_GCP_117: "Ensure basic roles are not used at project level."
# This is only used for workshops!!!
project = google_project.tbd_project.project_id
role = "roles/owner"
member = "serviceAccount:${google_service_account.tbd-terraform.email}"
}



resource "google_storage_bucket" "tbd-state-bucket" {
project = google_project.tbd_project.project_id
name = "${local.project}-state"
location = var.region
-uniform_bucket_level_access = false #tfsec:ignore:google-storage-enable-ubla
+uniform_bucket_level_access = false
force_destroy = true
lifecycle {
prevent_destroy = true
}

#checkov:skip=CKV_GCP_62: "Bucket should log access"
#checkov:skip=CKV_GCP_29: "Ensure that Cloud Storage buckets have uniform bucket-level access enabled"
#checkov:skip=CKV_GCP_78: "Ensure Cloud storage has versioning enabled"
public_access_prevention = "enforced"
#checkov:skip=CKV_GCP_103: "Ensure Dataproc Clusters do not have public IPs"
}

resource "google_dataproc_cluster" "tbd_cluster" {
project = google_project.tbd_project.project_id
name = "${local.project}-cluster"
region = var.region

cluster_config {
staging_bucket = google_storage_bucket.tbd-state-bucket.name

master_config {
num_instances = 1
machine_type = "n1-highmem-4" # High memory for master
}

worker_config {
num_instances = 2
machine_type = "n1-highmem-4" # High memory for workers
}

initialization_action {
script = "gs://dataproc-initialization-actions/conda/bootstrap-conda.sh"
}
}

labels = {
env = "dev"
}
#checkov:skip=CKV_GCP_91: "Ensure Dataproc cluster is encrypted with Customer Supplied Encryption Keys (CSEK)"
}

resource "google_dataproc_job" "example_pyspark" {
project = google_project.tbd_project.project_id
region = var.region

pyspark_config {
main_python_file_uri = "gs://path-to-your-pyspark-job.py"

properties = {
"spark.executor.memory" = "3g" # Reduced to fit within worker memory
"spark.executor.memoryOverhead" = "512m" # Adjusted overhead
"spark.executor.cores" = "1"
"spark.driver.memory" = "2g" # Ensure driver memory is sufficient
"spark.driver.memoryOverhead" = "512m"
"spark.dynamicAllocation.enabled" = "true"
"spark.dynamicAllocation.minExecutors" = "1"
"spark.dynamicAllocation.maxExecutors" = "4" # Adjusted to fit within cluster size
}
}

cluster = google_dataproc_cluster.tbd_cluster.name
}
5 changes: 5 additions & 0 deletions bootstrap/output.tf
@@ -10,4 +10,9 @@ output "terraform_state_bucket" {
output "terraform_service_account" {
value = google_service_account.tbd-terraform.email
description = "Terraform service account"
}

output "dataproc_cluster_name" {
value = google_dataproc_cluster.tbd_cluster.name
description = "Dataproc cluster name"
}
1 change: 1 addition & 0 deletions cicd_bootstrap/.terraform.lock.hcl


2 changes: 1 addition & 1 deletion cicd_bootstrap/conf/github_actions.tfvars
@@ -1,2 +1,2 @@
github_org = "bdg-tbd"
github_org = "a-s-gorski"
github_repo = "tbd-workshop-1"
Binary file added doc/figures/airflow.png
Binary file added doc/figures/architecture_diagram.drawio.png
Binary file added doc/figures/bigquery.png
Binary file added doc/figures/graph.png
Binary file added doc/figures/infracost_image1.png
Binary file added doc/figures/infracost_image2.png
Binary file added doc/figures/infracost_local.png
Binary file added doc/figures/release.png
Binary file added doc/figures/vertex_ai.png
Binary file added doc/figures/yarn_ui.png
2 changes: 1 addition & 1 deletion env/backend.tfvars
@@ -1 +1 @@
-bucket = "tbd-2024l-9910-state"
+bucket = "tbd-2024l-3040540-state"
4 changes: 2 additions & 2 deletions env/project.tfvars
@@ -1,3 +1,3 @@
-project_name = "tbd-2024l-9910"
-iac_service_account = "tbd-2024l-9910-lab@tbd-2024l-9910.iam.gserviceaccount.com"
+project_name = "tbd-2024l-3040540"
+iac_service_account = "tbd-2024l-3040540-lab@tbd-2024l-3040540.iam.gserviceaccount.com"
ai_notebook_instance_owner = "[email protected]"
Binary file added graph.png
63 changes: 63 additions & 0 deletions infracost-usage.yml
@@ -0,0 +1,63 @@
version: 0.1

google_artifact_registry_repository.registry:
storage_gb: 15 # Total data stored in the repository in GB
monthly_egress_data_transfer_gb: # Monthly data delivered from the artifact registry repository in GB. You can specify any number of Google Cloud regions below, replacing - with _, e.g.:
europe_west1: 10 # GB of data delivered from the artifact registry to europe-west1.


google_storage_bucket.my_bucket:
storage_gb: 150 # Total size of bucket in GB.
monthly_class_a_operations: 40 # Monthly number of class A operations (object adds, bucket/object list).
monthly_class_b_operations: 20 # Monthly number of class B operations (object gets, retrieve bucket/object metadata).
monthly_data_retrieval_gb: 5 # Monthly amount of data retrieved in GB.
monthly_egress_data_transfer_gb: # Monthly data transfer from Cloud Storage to the following, in GB:
same_continent: 55

google_storage_bucket.mlflow_artifacts_bucket:
storage_gb: 150 # Total size of bucket in GB.
monthly_class_a_operations: 40 # Monthly number of class A operations (object adds, bucket/object list).
monthly_class_b_operations: 20 # Monthly number of class B operations (object gets, retrieve bucket/object metadata).
monthly_data_retrieval_gb: 5 # Monthly amount of data retrieved in GB.
monthly_egress_data_transfer_gb: # Monthly data transfer from Cloud Storage to the following, in GB:
same_continent: 55


google_storage_bucket.tbd-state-bucket:
storage_gb: 150 # Total size of bucket in GB.
monthly_class_a_operations: 40 # Monthly number of class A operations (object adds, bucket/object list).
monthly_class_b_operations: 20 # Monthly number of class B operations (object gets, retrieve bucket/object metadata).
monthly_data_retrieval_gb: 5 # Monthly amount of data retrieved in GB.
monthly_egress_data_transfer_gb: # Monthly data transfer from Cloud Storage to the following, in GB:
same_continent: 55

google_storage_bucket.tbd-code-bucket:
storage_gb: 150 # Total size of bucket in GB.
monthly_class_a_operations: 40 # Monthly number of class A operations (object adds, bucket/object list).
monthly_class_b_operations: 20 # Monthly number of class B operations (object gets, retrieve bucket/object metadata).
monthly_data_retrieval_gb: 5 # Monthly amount of data retrieved in GB.
monthly_egress_data_transfer_gb: # Monthly data transfer from Cloud Storage to the following, in GB:
same_continent: 55

google_storage_bucket.tbd-data-bucket:
storage_gb: 150 # Total size of bucket in GB.
monthly_class_a_operations: 40 # Monthly number of class A operations (object adds, bucket/object list).
monthly_class_b_operations: 20 # Monthly number of class B operations (object gets, retrieve bucket/object metadata).
monthly_data_retrieval_gb: 5 # Monthly amount of data retrieved in GB.
monthly_egress_data_transfer_gb: # Monthly data transfer from Cloud Storage to the following, in GB:
same_continent: 55

google_storage_bucket.notebook-conf-bucket:
storage_gb: 150 # Total size of bucket in GB.
monthly_class_a_operations: 40 # Monthly number of class A operations (object adds, bucket/object list).
monthly_class_b_operations: 20 # Monthly number of class B operations (object gets, retrieve bucket/object metadata).
monthly_data_retrieval_gb: 5 # Monthly amount of data retrieved in GB.
monthly_egress_data_transfer_gb: # Monthly data transfer from Cloud Storage to the following, in GB:
same_continent: 55


google_service_networking_connection.private_vpc_connection:
monthly_egress_data_transfer_gb: # Monthly VM-VM data transfer from VPN gateway to the following, in GB:
same_region: 250 # VMs in the same Google Cloud region.
europe: 70 # Between Google Cloud regions within Europe.
worldwide: 200 # to a Google Cloud region on another continent.
3 changes: 1 addition & 2 deletions main.tf
@@ -12,7 +12,7 @@ locals {
spark_blockmgr_port = 30001
dbt_version = "1.7.13"
dbt_spark_version = "1.7.1"
-dbt_git_repo = "https://github.com/mwiewior/tbd-tpc-di.git"
+dbt_git_repo = "https://github.com/a-s-gorski/tbd-tpc-di.git"
dbt_git_repo_branch = "main"
}

@@ -65,7 +65,6 @@ module "dataproc" {
project_name = var.project_name
region = var.region
subnet = module.vpc.subnets[local.notebook_subnet_id].id
-machine_type = "e2-standard-2"
}

## Uncomment for Dataproc batches (serverless)
1 change: 1 addition & 0 deletions mlops/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ resource "google_project_service" "compute" {

module "gcp_vpc" {
#checkov:skip=CKV2_GCP_18: "Ensure GCP network defines a firewall and does not use the default firewall"
#checkov:skip=CKV_TF_2: "Ensure Terraform module sources use a tag with a version number"
depends_on = [google_project_service.compute]
source = "terraform-google-modules/network/google"
version = "~> 9.0.0"