Merge pull request #269 from chrisr3d/main

Doping substances taxonomy
MISP · Oct 19, 2023 · 8be1cf5 · 8be1cf5
2 parents 84fd295 + 65e8a70
commit 8be1cf5
Show file tree

Hide file tree

Showing 5 changed files with 1,240 additions and 0 deletions.
diff --git a/MANIFEST.json b/MANIFEST.json
@@ -742,6 +742,11 @@
       "description": "Workflow support language is a common language to support intelligence analysts to perform their analysis on data and information.",
       "name": "workflow",
       "version": 11
+    },
+    {
+      "description": "This taxonomy aims to list doping substances",
+      "name": "doping-substances",
+      "version": 2
     }
   ],
   "url": "https://raw.githubusercontent.com/MISP/misp-taxonomies/main/",

diff --git a/doping-substances/Misp-logo.png b/doping-substances/Misp-logo.png
diff --git a/doping-substances/README.md b/doping-substances/README.md
@@ -0,0 +1,44 @@
+# MISP_DopingSubstanceTaxonomy
+
+This project aims to gather information about all doping substances prohibited in sport.
+
+We collected all of the information on the [WADA website](https://www.wada-ama.org/en/prohibited-list).
+
+To do that, we created a Python script that scrapes this website and generates a JSON file (the taxonomy).
+
+This taxonomy can be added to MISP to help sports organizations fight against the use of doping substances.
+
+## MISP
+
+![logo](Misp-logo.png)
+
+What is MISP?
+
+>A threat intelligence platform for sharing, storing and correlating 
+Indicators of Compromise of targeted attacks, threat intelligence, 
+financial fraud information, vulnerability information or even 
+counter-terrorism information. Discover how MISP is used today in 
+multiple organisations. Not only to store, share, collaborate on cyber 
+security indicators, malware analysis, but also to use the IoCs and 
+information to detect and prevent attacks, frauds or threats against ICT
+ infrastructures, organisations or people.
+
+## JSON Generation
+
+In order to build the JSON file, we created a Python script which scrapes the prohibited list published by WADA (World Anti-Doping Agency).
+
+Thanks to BeautifulSoup, a useful library that helps a lot when it comes to scraping HTML documents, the script is able to retrieve the full list of doping substances.
+
+The file is created with PyTaxonomies, a MISP library that helps to create valid JSON files according to the [MISP Platform](https://www.misp-project.org/taxonomies.html#_misp_taxonomies).
+
+Finally, the script generates all predicates (doping categories) and the entries associated (the doping substances themselves).
+
+## Installation
+
+If you want to try it out yourself, you need to have BeautifulSoup, PyTaxonomies and requests installed.
+
+## Authors
+
+DELUS Thibaut : https://github.com/WooZyhh
+
+JACOB Lucas : https://github.com/Chaamoxs
diff --git a/doping-substances/gen_taxonomy.py b/doping-substances/gen_taxonomy.py
@@ -0,0 +1,63 @@
+import json
+import requests
+from bs4 import BeautifulSoup
+from pathlib import Path
+from pytaxonomies import Entry, Predicate, Taxonomy
+
+CONTENT_URL = 'https://www.wada-ama.org/en/prohibited-list'  # page fetched and parsed by generate_taxonomy()
+
+TAXONOMY_DESCRIPTION = 'This taxonomy aims to list doping substances'  # assigned to Taxonomy.description
+TAXONOMY_EXPANDED = 'Doping substances'  # assigned to Taxonomy.expanded
+TAXONOMY_NAME = 'doping-substances'  # assigned to Taxonomy.name
+
+ignore = ('NON-APPROVED SUBSTANCES', )  # panel titles skipped when building predicates and entries
+
+
+def list_predicates(articles):
+    predicates = {}
+    for article in articles:
+        title = article.find('p', attrs={'class': 'h3 panel-title'}).text
+        if title in ignore:
+            continue
+        predicate = Predicate()
+        predicate.predicate = title
+        div = article.find('div', attrs={'class': 'layout-wysiwyg'})
+        description = div.find('p')
+        predicate.description = description.find_next_sibling().text
+        predicates[title] = predicate
+    return predicates
+
+
+def generate_taxonomy():
+    new_taxonomy = Taxonomy()
+
+    new_taxonomy.name = TAXONOMY_NAME
+    new_taxonomy.expanded = TAXONOMY_EXPANDED
+    new_taxonomy.description = TAXONOMY_DESCRIPTION
+
+    response = requests.get(CONTENT_URL)
+    soup = BeautifulSoup(response.text, 'html.parser')
+    articles = soup.findAll('article', attrs={'class': 'panel hide-reader'})
+
+    new_taxonomy.predicates = list_predicates(articles)
+
+    for article in articles:
+        title = article.find('p', attrs={'class': 'h3 panel-title'}).text
+        if title in ignore:
+            continue
+        products = article.findAll('li')
+        products_list = {}
+        for product in products:
+            entry = Entry()
+            entry.value = product.text
+            products_list[entry.value] = entry
+        new_taxonomy.predicates[title].entries = products_list
+
+    return new_taxonomy
+
+
+if __name__ == '__main__':
+    taxonomy = generate_taxonomy()
+    taxonomy.version = 2
+    with open(Path(__file__).resolve().parent / 'machinetag.json', 'wt', encoding='utf-8') as f:
+        json.dump(taxonomy.to_dict(), f, indent=2, ensure_ascii=False)