Merge pull request #52 from Azure-Samples/hdinsightkafka-databricks-sqldw

Added Event Hubs Kafka, HDInsight Kafka, Data Explorer, SQL DW
algattik authored Sep 11, 2019
2 parents 3a51c10 + c89a44d commit 78a03e3
Showing 68 changed files with 2,647 additions and 678 deletions.
29 changes: 25 additions & 4 deletions README.md
@@ -17,7 +17,7 @@ products:
- azure-stream-analytics
- azure-storage
statusNotificationTargets:
- damauri@microsoft.com
- algattik@microsoft.com
description: "How to setup an end-to-end solution to implement a streaming at scale scenario using a choice of different Azure technologies."
---

@@ -113,6 +113,14 @@ Implement a stream processing architecture using:
- Azure Databricks (Stream Process)
- Cosmos DB (Serve)

### [Event Hubs Kafka + Azure Databricks + Cosmos DB](eventhubskafka-databricks-cosmosdb)

Implement a stream processing architecture using:

- Event Hubs (Ingest / Immutable Log) with Kafka endpoint (see the client sketch below)
- Azure Databricks (Stream Process)
- Cosmos DB (Serve)
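
Because this variant ingests through the Event Hubs Kafka endpoint, any Kafka client can talk to the stream directly. A minimal consumer sketch (the namespace, topic and connection string are placeholders, not values from this repository):

```bash
# Sketch only: consume an Event Hubs topic over the Kafka endpoint (port 9093).
# <namespace>, <topic> and <connection-string> are placeholders.
kafkacat -C \
  -b <namespace>.servicebus.windows.net:9093 \
  -t <topic> \
  -X security.protocol=SASL_SSL \
  -X sasl.mechanisms=PLAIN \
  -X sasl.username='$ConnectionString' \
  -X sasl.password='<connection-string>'
```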

### [Event Hubs + Azure Databricks + Delta](eventhubs-databricks-delta)

Implement a stream processing architecture using:
@@ -161,12 +169,27 @@ Implement a stream processing architecture using:
- Stream Analytics (Stream Process)
- Event Hubs (Serve)

### [HDInsight Kafka + Azure Databricks + Azure SQL Data Warehouse](hdinsightkafka-databricks-sqldw)

Implement a stream processing architecture using:

- HDInsight Kafka (Ingest / Immutable Log) (see the provisioning sketch below)
- Azure Databricks (Stream Process)
- Azure SQL Data Warehouse (Serve)
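
The ingestion cluster for this scenario can be provisioned with the Azure CLI. A minimal sketch, assuming illustrative names and omitting the storage account and credential parameters a real Kafka cluster also requires:

```bash
# Sketch only: stand up an HDInsight Kafka cluster for the ingestion layer.
# $HDINSIGHT_NAME is illustrative; required storage and password flags are omitted.
az hdinsight create -n $HDINSIGHT_NAME -g $RESOURCE_GROUP \
  --type kafka \
  --component-version Kafka=2.1 \
  --workernode-data-disks-per-node 2 \
  -o tsv >> log.txt
```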

### [Event Hubs + Azure Data Explorer](eventhubs-dataexplorer)

Implement a stream processing architecture using:

- Event Hubs (Ingest / Immutable Log)
- Azure Data Explorer (Stream Process / Serve)

### [Event Hubs + Data Accelerator + Cosmos DB](eventhubs-dataaccelerator-cosmosdb)

Implement a stream processing architecture using:

- Event Hubs (Ingest / Immutable Log)
- Microsft Data Accelerator on HDInsight and Service Fabric (Stream Process)
- Microsoft Data Accelerator on HDInsight and Service Fabric (Stream Process)
- Cosmos DB (Serve)

## Note
@@ -180,15 +203,13 @@ The following technologies could also be used in the end-to-end sample solution.
### Ingestion

- IoT Hub
- EventHub Kafka

### Stream Processing

- Azure Data Explorer

### Batch Processing

- Databricks Spark
- Azure Data Explorer

### Serving Layer
8 changes: 4 additions & 4 deletions _bootstrap/README.md
@@ -63,11 +63,11 @@ You can take a look at the following script to understand how you can create you

Once you have created your file, make sure you rename it so that it is clear which technology it uses.

### source ../simulator/run-event-generator.sh
### source ../simulator/run-generator-eventhubs.sh

`run-event-generator.sh` contains the code need to setup a [Locust](http://locust.io) cluster in distributed mode, usiong Azure Container Instances.
`run-generator-eventhubs.sh` contains the code needed to set up Spark clients, using Azure Container Instances.

Each locust generates up to 340 msgs/sec. Each generated message is close to 1KB and look like this:
Each client generates up to 2000 msgs/sec. Each generated message is close to 1KB and looks like this:

```json
{
    ...
}
```

and it will send data to the specified Event Hub. If you need to send data to something different, then you will need to create a new locustfile in [../simulator/simulator.py](../simulator/simulator.py) and also make sure it is uploaded to the shared file folder (check the code in the script).
and it will send data to the specified Event Hub.
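
For orientation, one simulator instance could be launched on Azure Container Instances roughly as follows (the image name and environment variable are placeholders; the actual values live in `run-generator-eventhubs.sh`):

```bash
# Sketch only: launch a single generator container.
# <simulator-image> and EVENTHUB_CS are placeholders, not the script's values.
az container create -g $RESOURCE_GROUP -n generator-1 \
  --image <simulator-image> \
  --cpu 2 --memory 4 \
  --environment-variables EventHubConnectionString="$EVENTHUB_CS" \
  -o tsv >> log.txt
```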

### source ../components/azure-event-hubs/report-throughput.sh

Expand Down
14 changes: 7 additions & 7 deletions _bootstrap/create-solution.sh
Expand Up @@ -66,17 +66,17 @@ if [ "$TESTTYPE" == "10" ]; then
export PROC_JOB_NAME=streamingjob
export PROC_STREAMING_UNITS=36 # must be 1, 3, 6 or a multiple of 6
export COSMOSDB_RU=100000
export TEST_CLIENTS=30
export SIMULATOR_INSTANCES=5
fi

# 5500 messages/sec
# 5000 messages/sec
if [ "$TESTTYPE" == "5" ]; then
export EVENTHUB_PARTITIONS=8
export EVENTHUB_CAPACITY=6
export PROC_JOB_NAME=streamingjob
export PROC_STREAMING_UNITS=24 # must be 1, 3, 6 or a multiple of 6
export COSMOSDB_RU=60000
export TEST_CLIENTS=16
export SIMULATOR_INSTANCES=3
fi

# 1000 messages/sec
@@ -86,13 +86,13 @@ if [ "$TESTTYPE" == "1" ]; then
export PROC_JOB_NAME=streamingjob
export PROC_STREAMING_UNITS=6 # must be 1, 3, 6 or a multiple of 6
export COSMOSDB_RU=20000
export TEST_CLIENTS=3
export SIMULATOR_INSTANCES=1
fi

# ---- END: SET THE VALUES TO CORRECTLY HANDLE THE WORKLOAD

# last checks and variables setup
if [ -z ${TEST_CLIENTS+x} ]; then
if [ -z ${SIMULATOR_INSTANCES+x} ]; then
usage
fi

@@ -120,7 +120,7 @@ echo ". Region => $LOCATION"
echo ". EventHubs => TU: $EVENTHUB_CAPACITY, Partitions: $EVENTHUB_PARTITIONS"
echo ". StreamAnalytics => Name: $PROC_JOB_NAME, SU: $PROC_STREAMING_UNITS"
echo ". CosmosDB => RU: $COSMOSDB_RU"
echo ". Locusts => $TEST_CLIENTS"
echo ". Simulators => $SIMULATOR_INSTANCES"
echo

echo "Deployment started..."
@@ -186,7 +186,7 @@ echo "***** [T] Starting up TEST clients"

RUN=`echo $STEPS | grep T -o || true`
if [ ! -z "$RUN" ]; then
source ../simulator/run-event-generator.sh
source ../simulator/run-generator-eventhubs.sh
fi
echo

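The TESTTYPE presets above are selected when the script is invoked. A hypothetical invocation (the flag names are assumptions; check the script's usage output):

```bash
# Assumed flags, for illustration only: -d solution name, -t throughput preset.
./create-solution.sh -d mysolution1 -t 5   # deploy the 5000 msg/sec preset
```
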
29 changes: 29 additions & 0 deletions components/azure-common/create-virtual-network.sh
@@ -0,0 +1,29 @@
#!/bin/bash

# Strict mode, fail on any error
set -euo pipefail

echo 'creating virtual network'
echo ". name: $VNET_NAME"

az group create -n $RESOURCE_GROUP -l $LOCATION --tags streaming_at_scale_generated=1 \
-o tsv >> log.txt

if ! az network vnet show -n $VNET_NAME -g $RESOURCE_GROUP -o none 2>/dev/null; then
az network vnet create -n $VNET_NAME -g $RESOURCE_GROUP \
--address-prefix 10.0.0.0/16 \
-o tsv >> log.txt
fi

az network vnet subnet create -g $RESOURCE_GROUP --vnet-name $VNET_NAME \
-n producers-subnet --address-prefixes 10.0.0.0/24 \
--delegations Microsoft.ContainerInstance/containerGroups \
-o tsv >> log.txt

az network vnet subnet create -g $RESOURCE_GROUP --vnet-name $VNET_NAME \
-n ingestion-subnet --address-prefixes 10.0.1.0/24 \
-o tsv >> log.txt

az network vnet subnet create -g $RESOURCE_GROUP --vnet-name $VNET_NAME \
-n streaming-subnet --address-prefixes 10.0.2.0/24 \
-o tsv >> log.txt
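
After the script runs, the resulting layout can be verified with a query such as the following (a check sketch, not part of the script; the `addressPrefix` field name assumes the CLI output of the time):

```bash
# List the subnets created above with their address prefixes.
az network vnet subnet list -g $RESOURCE_GROUP --vnet-name $VNET_NAME \
  --query '[].{name:name, prefix:addressPrefix}' -o table
```
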
10 changes: 7 additions & 3 deletions components/azure-databricks/create-databricks.sh
@@ -3,9 +3,13 @@
# Strict mode, fail on any error
set -euo pipefail

if [[ -n "${DATABRICKS_HOST:-}" && -n "${DATABRICKS_TOKEN:-}" ]]; then
if [ -n "${DATABRICKS_TOKEN:-}" ]; then

echo 'Not creating Databricks workspace. Using environment DATABRICKS_HOST and DATABRICKS_TOKEN settings'
echo 'Not creating Databricks workspace. Using environment DATABRICKS_TOKEN setting'

if [ -z "${DATABRICKS_HOST:-}" ]; then
export DATABRICKS_HOST="https://$LOCATION.azuredatabricks.net"
fi

else

@@ -18,7 +22,7 @@ az group deployment create \
--template-file ../components/azure-databricks/databricks-arm-template.json \
--parameters \
workspaceName=$ADB_WORKSPACE \
tier=standard \
pricingTier=standard \
-o tsv >>log.txt
fi

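With this change, an existing Databricks workspace can be reused by exporting a token before deployment; DATABRICKS_HOST now defaults to the regional URL. A minimal sketch (the token value is a placeholder):

```bash
# Reuse an existing Databricks workspace instead of creating one.
export DATABRICKS_TOKEN="<personal-access-token>"
# Optional override; otherwise the script derives it from $LOCATION:
# export DATABRICKS_HOST="https://northeurope.azuredatabricks.net"
```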