aws-samples · james-jory · Jun 19, 2020 · May 29, 2020 · May 29, 2020 · May 29, 2020
diff --git a/.gitignore b/.gitignore
@@ -1,7 +1,6 @@
 __pycache__/
 .vscode
 build/*
-*.zip
 workshop/*/.ipynb_checkpoints
 workshop/1-Personalization/interactions.csv
 workshop/1-Personalization/items.csv
@@ -16,4 +15,8 @@ demo.md
 generators/*.gz
 /csvs/
 .unotes/*
-!workshop/5-Conversational/RetailDemoStore_Lex.zip
+# Ignore zip archives but keep RetailDemoStore_Lex.zip tracked. The negation
+# must come after *.zip because the last matching pattern decides the outcome.
+*.zip
+!workshop/5-Conversational/RetailDemoStore_Lex.zip
+workshop/data/*
+workshop/datagenerator/*
+workshop/requirements.txt
diff --git a/generators/README.md b/generators/README.md
@@ -0,0 +1,17 @@
+# User Data Generator
+
+generate_users_json.py generates a set of users for the Retail Demo Store.
+
+These user profiles are used in the following ways:
+
+* The Users service provides login services for the user profiles that this script creates for the Retail Demo Store
+* Workshops which need to generate simulated user behavior data can use the datagenerator library to create simulated events for these user profiles after they are created.  This provides realistic and consistent data across all integrated tools in the Retail Demo Store.
+
+## datagenerator Library
+
+The datagenerator library is a Python library that provides the following functions:
+
+* A pool of randomly generated users (see ./datagenerator/users.py)
+* The ability to specify a set of user behavior funnels and to then generate events that can be sent to Amazon Personalize, Segment, or Amplitude.  (see ./datagenerator/file.py, amplitude.py, and segment.py)
+
+For usage examples of the event generator features, see ../workshop/3-Experimentation/3.5-Amplitude-Performance-Metrics.ipynb.
diff --git a/generators/datagenerator/__init__.py b/generators/datagenerator/__init__.py
@@ -0,0 +1,4 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+# Version string of the datagenerator package.
+aws_datagenerator_version = '1.8.0'
diff --git a/generators/datagenerator/amplitude.py b/generators/datagenerator/amplitude.py
@@ -0,0 +1,80 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+import datagenerator
+import json
+import requests
+import yaml
+
+# Amplitude event support
+# This follows the Amplitude V2 HTTP Bulk API spec, here:
+# https://help.amplitude.com/hc/en-us/articles/360032842391-HTTP-API-V2
+#
+# These classes accept a user, platform, and general event properties and map them 
+# into an Amplitude API compatible representation.
+
+class AmplitudeEvent:
+  """Base Amplitude event holding the time, user ID and device fields.
+
+  Every instance attribute becomes a JSON key (see __repr__), so attributes
+  are assigned in the order they should appear in the serialized event.
+  """
+
+  def __init__(self, timestamp, user, platform):
+    # Amplitude time is milliseconds since epoch
+    self.time = int(timestamp.timestamp() * 1000)
+    # Amplitude user ID is a string type with a minimum length of 5, so the
+    # ID is left-padded with zeros
+    self.user_id = f'{user.id:0>5}'
+
+    device = user.get_platform_data(platform)
+    self.device_id = device['anonymous_id']
+
+    # Only 'ios' and 'android' add advertising and device details
+    if platform in ('ios', 'android'):
+      if platform == 'ios':
+        self.idfa = device['advertising_id']
+        self.platform = 'iOS'
+      else:
+        self.adid = device['advertising_id']
+      self.device_model = device['model']
+      self.os_version = device['version']
+
+  def toJson(self):
+    """Return the event serialized as a JSON string."""
+    return repr(self)
+
+  def __repr__(self):
+    return json.dumps(vars(self))
+
+class AmplitudeIdentifyEvent(AmplitudeEvent):
+  """Amplitude '$identify' event carrying the user's traits and profile fields."""
+
+  def __init__(self, timestamp, user, platform):
+    super().__init__(timestamp, user, platform)
+    self.event_type = '$identify'
+    # Copy the traits: assigning user.traits directly would alias the dict,
+    # and the profile fields added below would be written into the user's
+    # own traits as a side effect.
+    self.user_properties = dict(user.traits)
+    self.user_properties.update(
+      name=user.name,
+      email=user.email,
+      age=user.age,
+      gender=user.gender,
+      persona=user.persona,
+      username=user.username,
+    )
+
+class AmplitudeTrackEvent(AmplitudeEvent):
+  """Amplitude event for a named action together with its event properties."""
+
+  def __init__(self, name, timestamp, user, platform, properties):
+    super().__init__(timestamp, user, platform)
+    # event_type is set before event_properties to keep the attribute order
+    self.event_type, self.event_properties = name, properties
+
+class AmplitudeSender:
+  """Posts batches of events to the Amplitude HTTP V2 API endpoint."""
+
+  def __init__(self, config):
+    self.config = config # MUST BE:  { 'api_key': <Amplitude API Key> }
+    self.endpoint = 'https://api.amplitude.com/2/httpapi'
+
+  def send_batch(self, platform, events, debug=False):
+    """Send one batch of events, or pretty-print it when debug is set.
+
+    platform is accepted but not used by this sender.
+    events is a list of events; objects that json cannot serialize natively
+    are serialized through their __dict__.
+    Returns the response of the POST request, or None in debug mode.
+    """
+    payload = json.dumps(
+      {'api_key': self.config['api_key'], 'events': events},
+      default=lambda obj: obj.__dict__)
+
+    if debug:
+      # Round-trip through json to print the batch with indentation
+      print(json.dumps(json.loads(payload), indent=4))
+      return None
+
+    # The body is JSON, so label it as such, as the Amplitude HTTP V2 API
+    # examples do.
+    return requests.post(
+      self.endpoint,
+      data=payload,
+      headers={'Content-Type': 'application/json'})
diff --git a/generators/datagenerator/file.py b/generators/datagenerator/file.py
@@ -0,0 +1,25 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+class FileEvent:
+  """One event rendered as a single comma-separated line for file output."""
+
+  def __init__(self, name, timestamp, user, platform, properties):
+    # properties is accepted but not used by this event type
+    self.event = name
+    self.timestamp = timestamp.isoformat()
+    self.user_id = user.id
+    self.anonymous_id = user.get_platform_data(platform)['anonymous_id']
+    self.platform = platform
+    # Only the trait values are kept, each one prefixed with a comma
+    self.traits = ''.join(f',{value}' for _, value in user.traits.items())
+
+  def str(self):
+    """Return the event as its comma-separated line (same as repr)."""
+    return repr(self)
+
+  def __repr__(self):
+    columns = (self.event, self.timestamp, self.user_id, self.anonymous_id, self.platform)
+    line = ','.join(f'{column}' for column in columns)
+    return line + self.traits + '\n'
diff --git a/generators/datagenerator/funnel.py b/generators/datagenerator/funnel.py
@@ -0,0 +1,68 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+import random
+import numpy as np
+import datetime
+import inspect
+from datagenerator.output import OutputFormatter
+from collections.abc import Mapping, Iterable
+
+class Funnel:
+  """Iterator over the events one user generates while moving through a funnel.
+
+  funnel is a dict with these keys:
+    'platform'   - platform name handed to the output formatters
+    'templates'  - sequence of (event name, property template dict) pairs
+    'user_props' - optional; traits set on the user, enables identify calls
+    'state'      - optional; callable that takes the user and returns the
+                   state object passed to callable property values
+
+  Iterating yields one OutputFormatter per event. The first event is always
+  produced; each later event is produced with a probability that starts at
+  60% and grows by 10 points per event, up to 100%.
+  """
+
+  def __init__(self, timestamp, funnel, user):
+    self.funnel = funnel
+    self.event_index = 0
+    self.timestamp = timestamp
+    self.platform = self.funnel['platform']
+    self.user = user
+
+    if 'user_props' in self.funnel:
+      self.user.set_traits(self.funnel['user_props'])
+      self.identify = True
+    else:
+      self.identify = False
+
+    if 'state' in self.funnel:
+      self.state = self.funnel['state'](self.user)  # Passes the user to the state lambda
+    else:
+      self.state = None
+
+  def __iter__(self):
+    return self
+
+  def __next__(self):
+    templates = self.funnel['templates']
+    success_percent = min(100, 50 + (self.event_index * 10)) / 100
+    proceed = self.proceed(success_percent)
+    at_start = self.event_index == 0
+    not_at_end = self.event_index < len(templates)
+    # This is to make sure that you always get at least the first event in a
+    # funnel, rest will be stochastic. not_at_end guards both cases so that an
+    # empty template list ends the iteration instead of raising IndexError.
+    if not_at_end and (proceed or at_start):
+      formatter = OutputFormatter(
+        self.timestamp,
+        self.user,
+        self.platform,
+        self.generate_props(self.event_index),
+        templates[self.event_index][0])
+      # Space consecutive events 30 seconds to 10 minutes apart
+      self.timestamp += datetime.timedelta(seconds=random.randint(30, 600))
+      self.event_index += 1
+      return formatter
+    raise StopIteration
+
+  def generate_props(self, index):
+    """Build the property dict for the template at index.
+
+    Template values are resolved as follows:
+      * key 'expand' with a callable value: the callable is called with the
+        state and the dict it returns is merged into the properties
+      * any other callable: called with the state, result stored under the key
+      * a non-string, non-mapping iterable: one element is chosen at random
+      * anything else, including strings, bytes and mappings: stored as is
+    """
+    template = self.funnel['templates'][index]
+    props = {}
+    for (k,v) in template[1].items():
+      if k == 'expand' and callable(v):
+        props = {**props, **v(self.state)}
+      elif callable(v):
+        props[k] = v(self.state)
+      elif isinstance(v, Iterable) and not isinstance(v, (str, bytes, Mapping)):
+        # Strings and mappings are iterable too, but they are literal values,
+        # not a list of choices
+        props[k] = random.choice(v)
+      else:
+        props[k] = v
+    return props
+
+  def proceed(self, p):
+    """Return 1 with probability p, otherwise 0 (a single binomial draw)."""
+    return np.random.binomial(1, p)
diff --git a/generators/datagenerator/output.py b/generators/datagenerator/output.py
@@ -0,0 +1,85 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+from datagenerator.segment import SegmentIdentifyEvent, SegmentTrackEvent, SegmentSender
+from datagenerator.amplitude import AmplitudeIdentifyEvent, AmplitudeTrackEvent, AmplitudeSender
+from datagenerator.file import FileEvent
+
+# TODO: Add Personalize output file formatter
+# TODO: Add Amplitude output formatter
+
+class OutputFormatter:
+  """Holds the data of one generated event and converts it on demand.
+
+  Each method below builds a fresh event object of the matching output type
+  from the stored timestamp, user, platform, properties and event name.
+  """
+
+  def __init__(self, timestamp, user, platform, properties, name = None):
+    self.event = name
+    self.timestamp = timestamp
+    self.user = user
+    self.properties = properties
+    self.platform = platform
+
+  # Amplitude
+
+  def amplitude_identify(self):
+    """Return an AmplitudeIdentifyEvent for the stored user."""
+    return AmplitudeIdentifyEvent(self.timestamp, self.user, self.platform)
+
+  def amplitude_event(self):
+    """Return an AmplitudeTrackEvent for the stored event."""
+    return AmplitudeTrackEvent(
+      self.event, self.timestamp, self.user, self.platform, self.properties)
+
+  # Segment
+
+  def segment_track(self):
+    """Return a SegmentTrackEvent for the stored event."""
+    return SegmentTrackEvent(
+      self.event, self.timestamp, self.user, self.platform, self.properties)
+
+  def segment_identify(self):
+    """Return a SegmentIdentifyEvent for the stored user."""
+    return SegmentIdentifyEvent(self.timestamp, self.user, self.platform)
+
+  # File
+
+  def file_event(self):
+    """Return a FileEvent for the stored event."""
+    return FileEvent(
+      self.event, self.timestamp, self.user, self.platform, self.properties)
+
+class OutputWriter:
+  """Writes every event of a collection of funnels to a file or a service.
+
+  sessions is an iterable of funnels. Each funnel is iterated for its
+  formatters and must expose the 'identify' and 'platform' attributes used
+  by the service writers.
+  """
+
+  def __init__(self, sessions):
+    self.sessions = sessions
+
+  def to_file(self, file_name):
+    """Write all events to file_name using the FileEvent output formatter."""
+    # The context manager closes the file even if a funnel raises
+    with open(file_name, 'w') as f:
+      for funnel in self.sessions:
+        for formatter in funnel:
+          f.write(formatter.file_event().str())
+
+  def to_amplitude(self, config, debug=False):
+    """Send all events to Amplitude, one batch per funnel.
+
+    config is passed to AmplitudeSender; debug is passed to send_batch.
+    """
+    sender = AmplitudeSender(config)
+    # NOTE(review): config holds the Amplitude API key, so this prints it
+    print(f'Send config is: {config}.')
+    count = 0
+    for funnel in self.sessions:
+      batch = []
+      count += 1
+      for formatter in funnel:
+        if funnel.identify:
+          # Send an identify call if specified in the funnel
+          batch.append(formatter.amplitude_identify())
+        batch.append(formatter.amplitude_event())
+      if len(batch) > 0:
+        response = sender.send_batch(funnel.platform, batch, debug)
+        # Any 2xx status is a success; send_batch returns None in debug mode
+        if response is not None and response.status_code >= 300:
+          print(f'Error sending to Amplitude: {response.text}')
+    print(f'Processed {count} funnels...')
+
+  def to_segment(self, config_file, debug=False):
+    """Send all events to Segment, one batch per funnel.
+
+    config_file is passed to SegmentSender; debug is passed to send_batch.
+    """
+    # Use the caller's config file rather than a hard-coded file name
+    sender = SegmentSender(config_file)
+    print(f'Send config is: {sender.config_keys}')
+    count = 0
+    for funnel in self.sessions:
+      batch = []
+      count += 1
+      for formatter in funnel:
+        if funnel.identify:
+          # Send an identify call if specified in the funnel
+          batch.append(formatter.segment_identify())
+        batch.append(formatter.segment_track())
+      if len(batch) > 0:
+        response = sender.send_batch(funnel.platform, batch, debug)
+        # Any 2xx status is a success; a None response is skipped
+        if response is not None and response.status_code >= 300:
+          print(f'Error sending to Segment: {response.text}')
+    print(f'Processed {count} funnels...')
diff --git a/generators/datagenerator/rdscatalog.py b/generators/datagenerator/rdscatalog.py
@@ -0,0 +1,14 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+import yaml
+from collections import UserList
+
+class RDSCatalog(UserList):
+  """List of catalog items loaded from a YAML file."""
+
+  def __init__(self, file):
+    """Load the items from the YAML file at path file."""
+    super().__init__()
+    # Close the file as soon as it is parsed instead of leaking the handle
+    with open(file) as f:
+      # NOTE(review): FullLoader is not meant for untrusted input; this
+      # assumes the catalog is a trusted project file - confirm.
+      items = yaml.load(f, Loader=yaml.FullLoader)
+    # An empty YAML document loads as None; keep the list interface usable
+    self.data = items if items is not None else []
+
+  def subcategory_sample(self, categories):
+    """Return the items whose 'category' value is contained in categories."""
+    return [item for item in self.data if item['category'] in categories]
diff --git a/generators/datagenerator/rdsuserstate.py b/generators/datagenerator/rdsuserstate.py
@@ -0,0 +1,75 @@
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+
+import random
+import uuid
+
+class RDSUserSelectionState:
+  """Randomly chosen products one simulated user searches, views and buys.
+
+  catalog is a sequence of item dicts that also provides
+  subcategory_sample(); the items are read through their 'id', 'name',
+  'category', 'image' and 'price' keys. user must expose a 'persona' string
+  of category names joined by underscores, or '' for no persona.
+
+  Raises ValueError when no catalog items are available for the user.
+  """
+
+  def __init__(self, catalog, user):
+    if user.persona != '':  # Added to support RDS personas from the catalog
+      self.search_results = catalog.subcategory_sample(user.persona.split('_'))
+    else:
+      self.search_results = random.sample(catalog, min(10, len(catalog)))
+    if not self.search_results:
+      # Nothing to view or buy; fail here with a clear message
+      raise ValueError('No catalog items available for this user')
+    # Sample sizes are clamped so that small result sets do not raise
+    self.subsample = random.sample(
+      self.search_results, min(5, len(self.search_results)))
+    self.cart = random.sample(self.subsample, min(3, len(self.subsample)))
+    self.cart_id = str(uuid.uuid4())
+    # Every word of every result name is a candidate search term
+    self.search_terms = []
+    for item in self.search_results:
+      self.search_terms.extend(item['name'].split(' '))
+
+  def search(self):
+    """Return the items in the user's search results."""
+    return self.search_results
+
+  def user_search(self):
+    """Return a query of up to two random search terms joined by a space."""
+    terms = random.sample(self.search_terms, min(2, len(self.search_terms)))
+    return ' '.join(terms)
+
+  def recommendations(self):
+    """Return up to three random items from the subsample."""
+    return random.sample(self.subsample, min(3, len(self.subsample)))
+
+  def cart_items(self):
+    """Return the items in the cart."""
+    return self.cart
+
+  def num_results(self):
+    """Return the number of search results."""
+    return len(self.search_results)
+
+  def cart_value(self):
+    """Return the sum of the prices of the items in the cart."""
+    return sum((item['price'] for item in self.cart), 0.0)
+
+  def item(self):
+    """Return one random item from the cart."""
+    return random.choice(self.cart)
+
+  # These are specific to RDS event properties
+  def item_added_event_props(self):
+    """Return the event properties for adding a random cart item."""
+    item = self.item()
+    return {
+      'productId': item['id'],
+      'cartId': self.cart_id,
+      'name': item['name'],
+      'category': item['category'],
+      'image': item['image'],
+      'price': item['price'],
+      'quantity': 1
+    }
+
+  def item_viewed_event_props(self):
+    """Return the event properties for viewing a random cart item."""
+    item = self.item()
+    return {
+      'productId': item['id'],
+      'name': item['name'],
+      'category': item['category'],
+      'image': item['image'],
+      'price': item['price']
+    }
+
+  def cart_viewed_event_props(self):
+    """Return the event properties for viewing the cart."""
+    # Subtotal and total are the same value here
+    value = self.cart_value()
+    return {
+      'cartId': self.cart_id,
+      'cartSubTotal': value,
+      'cartTotal': value,
+      'cartQuantity': len(self.cart)
+    }