Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rdrp 1015 s3 config: s3 parameters are now read from the dev config, not hardcoded. #353

Merged
merged 3 commits into from
Oct 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/dev_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ config_validation:
global:
# Logging settings
log_to_file: True # Write logs to .log file
logging_level: "DEBUG"
logging_level: "INFO"
table_config: "SingleLine"
# Environment settings
dev_test : False
dev_test : True
platform: network #whether to load from hdfs, network (Windows) or s3 (CDP)
load_from_feather: False
runlog_writer:
Expand Down
5 changes: 3 additions & 2 deletions src/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,10 @@ def run_pipeline(user_config_path, dev_config_path):
if platform == "s3":
# create singleton boto3 client object & pass in bucket string
from src.utils.singleton_boto import SingletonBoto
from src.utils.singleton_config import SingletonConfig

boto3_client = SingletonBoto.get_client() # noqa

boto3_client = SingletonBoto.get_client(config)
s3_bucket = SingletonConfig.get_config(config)
from src.utils import s3_mods as mods

# Creating boto3 client and adding it to the config dict
Expand Down
2 changes: 1 addition & 1 deletion src/user_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ s3_paths:
freezing_additions_path: "02_freezing/freezing_updates"
freezing_amendments_path: "02_freezing/freezing_updates"
# Imputation and outliers input paths
backdata_path: "2022_surveys/BERD/06_imputation/backdata_output/2022_backdata_anon.csv"
backdata_path: "/bat/res_dev/project_data/2021_surveys/BERD/06_imputation/backdata_output/2021_backdata_published_v347_anon.csv"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi George, I've corrected this, but it looks like you've put it back to how it was before. I suggest we use 2022 data for 2023 imputation, not 2021.

manual_imp_trim_path: "06_imputation/manual_trimming/trimming_qa_2023-11-27_v359.csv"
manual_outliers_path: "07_outliers/manual_outliers/manual_outlier_2023-08-29_v67.csv"
# Construction paths
Expand Down
22 changes: 8 additions & 14 deletions src/utils/s3_mods.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,21 +30,15 @@
import pandas as pd
from io import StringIO

# Third party libraries specific to s3 bucket
# import boto3
# import raz_client

# Local libraries
from rdsa_utils.cdp.helpers.s3_utils import file_exists, create_folder_on_s3
from src.utils.singleton_boto import SingletonBoto
from src.utils.singleton_config import SingletonConfig

# set up logging
# set up logging, boto3 client and s3 bucket
s3_logger = logging.getLogger(__name__)
s3_client = SingletonBoto.get_client()

ssl_file_dev = "/etc/pki/tls/certs/ca-bundle.crt"
s3_bucket_dev = "onscdp-dev-data01-5320d6ca"

s3_bucket = SingletonConfig.get_config()

# Read a CSV file into a Pandas dataframe
def rd_read_csv(filepath: str, **kwargs) -> pd.DataFrame:
Expand All @@ -61,7 +55,7 @@ def rd_read_csv(filepath: str, **kwargs) -> pd.DataFrame:
pd.DataFrame: Dataframe created from csv
"""

with s3_client.get_object(Bucket=s3_bucket_dev, Key=filepath)["Body"] as file:
with s3_client.get_object(Bucket=s3_bucket, Key=filepath)["Body"] as file:
# If "thousands" argument is not specified, set it to ","
if "thousands" not in kwargs:
kwargs["thousands"] = ","
Expand Down Expand Up @@ -99,7 +93,7 @@ def rd_write_csv(filepath: str, data: pd.DataFrame) -> None:

# Write the buffer into the s3 bucket
_ = s3_client.put_object(
Bucket=s3_bucket_dev, Body=csv_buffer.getvalue(), Key=filepath
Bucket=s3_bucket, Body=csv_buffer.getvalue(), Key=filepath
)
return None

Expand All @@ -115,7 +109,7 @@ def rd_load_json(filepath: str) -> dict:
"""

# Load the json file using the client method
with s3_client.get_object(Bucket=s3_bucket_dev, Key=filepath)["Body"] as json_file:
with s3_client.get_object(Bucket=s3_bucket, Key=filepath)["Body"] as json_file:
datadict = json.load(json_file)

return datadict
Expand All @@ -136,7 +130,7 @@ def rd_file_exists(filepath: str, raise_error=False) -> bool:
"""

result = file_exists(
client=s3_client, bucket_name=s3_bucket_dev, object_name=filepath
client=s3_client, bucket_name=s3_bucket, object_name=filepath
)

if not result and raise_error:
Expand All @@ -158,7 +152,7 @@ def rd_mkdir(path: str) -> None:
_ = create_folder_on_s3(
# client=config["client"],
s3_client,
bucket_name=s3_bucket_dev,
bucket_name=s3_bucket,
folder_path=path,
)

Expand Down
10 changes: 3 additions & 7 deletions src/utils/singleton_boto.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,19 @@

class SingletonBoto:
_instance = None
# _bucket_str = None

def __init__(self):
raise RuntimeError("This is a Singleton, invoke get_client() instead.")


@classmethod
def get_client(cls): # , bucket_str= None):
def get_client(cls, config={}): # , bucket_str= None):
if cls._instance is None:
client = boto3.client("s3")
raz_client.configure_ranger_raz(
client,
ssl_file="/etc/pki/tls/certs/ca-bundle.crt"
ssl_file=config["s3"]["ssl_file"]
)
Bucket=config["s3"]["s3_bucket"]
cls._instance = client
# cls._bucket_str = bucket_str
return cls._instance

# def get_bucket():
# return cls._bucket_str
21 changes: 21 additions & 0 deletions src/utils/singleton_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
"""
A class that initialises a single instance of the s3 bucket name read from the config
"""
import boto3
import raz_client


class SingletonConfig:
_instance = None

def __init__(self):
raise RuntimeError("This is a Singleton, invoke get_config() instead.")


@classmethod
def get_config(cls, config={}): # , bucket_str= None):
if cls._instance is None:
Bucket=config["s3"]["s3_bucket"]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you explain why Bucket has a capital, and why it shows as a different colour here?

cls._instance = Bucket
return cls._instance

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm really impressed you got this to work!
A couple of questions though: could this go in the same script as singleton_boto.py?
Could this second method be a method on the boto class? That was what I was trying to do.
Important thing is to get it working in Mig.

Loading