-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgenerate-raw-data.py
57 lines (45 loc) · 2.68 KB
/
generate-raw-data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import logging
from pathlib import Path
import duckdb
import numpy as np
import pandas as pd
from mimesis import Generic
logging.basicConfig(
level=logging.INFO, datefmt='%H:%M:%S',
format='%(asctime)s-%(levelname)s-%(name)s::%(module)s|%(lineno)s:: %(message)s'
)
if __name__ == '__main__':
faker = Generic(seed=1)
num_accounts = 30
accounts = pd.DataFrame({'Id': [f'a{i:05d}' for i in range(num_accounts)],
'CompanyName': [faker.finance.company() for _ in range(num_accounts)],
'BillingRegion': [faker.address.continent() for _ in range(num_accounts)]})
# Every customer should have at least one site, some have more:
num_extra_sites = 20
sites = pd.DataFrame({'Id': [f's{i:05d}' for i in range(num_accounts + num_extra_sites)]})
sites['SiteOwnerId'] = list(accounts['Id']) + list(accounts['Id'].sample(num_extra_sites, replace=True))
sites['Region'] = list(accounts['BillingRegion']) + list([faker.address.continent() for _ in range(num_extra_sites)])
num_orders = 50000
orders = pd.DataFrame({'Id': [f'o{i:05d}' for i in range(num_orders)]})
orders['SiteIdDelivery'] = sites['Id'].sample(num_orders, replace=True).values
orders['DateBooked'] = [faker.datetime.date(start=2020, end=2022) for _ in range(num_orders)]
orders['CurrencyCode'] = np.random.choice(['USD', 'EUR', 'GBP'], num_orders, replace=True, p=[0.7, 0.2, 0.1])
orders['Price'] = faker.numeric.floats(start=0, end=100, precision=2, n=num_orders)
orders['_percent'] = faker.numeric.floats(start=0, end=0.95, n=num_orders)
orders['Cost'] = orders['Price'].mul(orders['_percent']).round(2)
orders['Type'] = np.random.choice(['Installation', 'Service', 'Parts'], num_orders, replace=True, p=[0.4, 0.3, 0.3])
orders = orders.drop('_percent', axis=1)
fx_rates = pd.DataFrame({'Year': [2020, 2021, 2022, 2020, 2021, 2022],
'Currency': ['Euro', 'Euro', 'Euro', 'Pound Sterling', 'Pound Sterling', 'Pound Sterling'],
'Rate': [1.1, 1.2, 1.15, 1.3, 1.4, 1.2]})
logging.info('Synthetic data generated.')
path_duck_db = Path(__file__).parent / '../data/data-mart.duckdb'
if not path_duck_db.parent.exists():
path_duck_db.parent.mkdir()
con = duckdb.connect(database=str(path_duck_db), read_only=False)
logging.info(f'Connection to {path_duck_db} established.')
con.query("CREATE SCHEMA IF NOT EXISTS raw")
for table_name in ('accounts', 'sites', 'orders', 'fx_rates'):
query = f"DROP TABLE IF EXISTS raw.{table_name}; CREATE TABLE raw.{table_name} AS SELECT * FROM {table_name}"
logging.info(query)
con.query(query=query)