Skip to content

Commit

Permalink
Paris dataset, Paris preprocessing and first preprocessing tool, pari…
Browse files Browse the repository at this point in the history
…sh quickstatement
  • Loading branch information
unknown committed Nov 24, 2019
1 parent 744965a commit 0fe788c
Show file tree
Hide file tree
Showing 4 changed files with 7,669 additions and 0 deletions.
274 changes: 274 additions & 0 deletions France/Paris/Paris data collection.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,274 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The goal of these modules is to streamline the checking of water fountains as items in Wikidata. The input should be a pandas DataFrame that includes the following columns: 'X' = longitude, 'Y' = latitude, 'name' = unique name of the item."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"importing Jupyter notebook from pandas2quickstatements.ipynb\n",
"191124_192347\n",
"Python v 3.6.5\n"
]
}
],
"source": [
"import pandas as pd\n",
"import import_ipynb\n",
"import warnings; warnings.simplefilter('ignore')\n",
"from pandas2quickstatements import write_query, identify_nearest_fountains"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" ID Geo Point A_BOIRE\n",
"0 1032.0 48.8455274608,2.25550660889 1\n",
"1 1019.0 48.8381521871,2.25870518213 0\n",
"2 1036.0 48.8572651165,2.26686423645 1\n",
"3 986.0 48.8530939586,2.2705187223 1\n",
"4 856.0 48.8409169077,2.27577864693 1\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>Geo Point</th>\n",
" <th>A_BOIRE</th>\n",
" <th>Y</th>\n",
" <th>X</th>\n",
" <th>nearest_qid</th>\n",
" <th>nearest_has_label_de</th>\n",
" <th>nearest_has_date</th>\n",
" <th>nearest_has_operator</th>\n",
" <th>nearest_has_code</th>\n",
" <th>nearest_has_water_type</th>\n",
" <th>match_found</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1032.0</td>\n",
" <td>48.8455274608,2.25550660889</td>\n",
" <td>1</td>\n",
" <td>48.845527</td>\n",
" <td>2.255507</td>\n",
" <td>Q3076303</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>no match</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1019.0</td>\n",
" <td>48.8381521871,2.25870518213</td>\n",
" <td>0</td>\n",
" <td>48.838152</td>\n",
" <td>2.258705</td>\n",
" <td>Q3076303</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>no match</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1036.0</td>\n",
" <td>48.8572651165,2.26686423645</td>\n",
" <td>1</td>\n",
" <td>48.857265</td>\n",
" <td>2.266864</td>\n",
" <td>Q3076299</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>no match</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>986.0</td>\n",
" <td>48.8530939586,2.2705187223</td>\n",
" <td>1</td>\n",
" <td>48.853094</td>\n",
" <td>2.270519</td>\n",
" <td>Q3076299</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>no match</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>856.0</td>\n",
" <td>48.8409169077,2.27577864693</td>\n",
" <td>1</td>\n",
" <td>48.840917</td>\n",
" <td>2.275779</td>\n",
" <td>Q23459533</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>no match</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ID Geo Point A_BOIRE Y X \\\n",
"0 1032.0 48.8455274608,2.25550660889 1 48.845527 2.255507 \n",
"1 1019.0 48.8381521871,2.25870518213 0 48.838152 2.258705 \n",
"2 1036.0 48.8572651165,2.26686423645 1 48.857265 2.266864 \n",
"3 986.0 48.8530939586,2.2705187223 1 48.853094 2.270519 \n",
"4 856.0 48.8409169077,2.27577864693 1 48.840917 2.275779 \n",
"\n",
" nearest_qid nearest_has_label_de nearest_has_date nearest_has_operator \\\n",
"0 Q3076303 False False False \n",
"1 Q3076303 False False False \n",
"2 Q3076299 False False False \n",
"3 Q3076299 False False False \n",
"4 Q23459533 False False False \n",
"\n",
" nearest_has_code nearest_has_water_type match_found \n",
"0 False False no match \n",
"1 False False no match \n",
"2 False False no match \n",
"3 False False no match \n",
"4 False False no match "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the Paris fountain export (semicolon-separated CSV) into pandas.\n",
"df = pd.read_csv(\"fontaines-a-boire.csv\", sep = \";\")\n",
"location = \"Paris\"\n",
"\n",
"# Take an explicit copy so the column assignments below write to an independent\n",
"# frame instead of a slice of df (which triggers pandas' SettingWithCopyWarning).\n",
"paris_fountain_data = df[['ID', 'Geo Point', 'A_BOIRE']].copy()\n",
"\n",
"print(paris_fountain_data.head())\n",
"\n",
"# 'Geo Point' holds \"latitude,longitude\" as a single string; split it into\n",
"# numeric Y (latitude) and X (longitude) columns.\n",
"geo_split = paris_fountain_data['Geo Point'].str.split(\",\", n = 1, expand = True)\n",
"paris_fountain_data['Y'] = geo_split[0].astype(float)\n",
"paris_fountain_data['X'] = geo_split[1].astype(float)\n",
"# drop() returns a new frame; the result must be assigned for the column to be removed.\n",
"# NOTE(review): assumes identify_nearest_fountains / write_query do not read 'Geo Point' - confirm.\n",
"paris_fountain_data = paris_fountain_data.drop(columns = 'Geo Point')\n",
"\n",
"# Annotate each fountain with its nearest Wikidata item and match status.\n",
"paris_fountain_data = identify_nearest_fountains(paris_fountain_data, location)\n",
"\n",
"paris_fountain_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"unclear match\n",
"ID 296\n",
"Geo Point 48.8673321136,2.31791349622\n",
"A_BOIRE 0\n",
"Y 48.8673\n",
"X 2.31791\n",
"nearest_qid Q3076190\n",
"nearest_has_label_de False\n",
"nearest_has_date True\n",
"nearest_has_operator False\n",
"nearest_has_code False\n",
"nearest_has_water_type False\n",
"match_found unclear\n",
"Name: 1094, dtype: object\n",
"wrote 'quickstatement_commands_Paris_drink_191124_192508.txt' with 5582 lines\n"
]
}
],
"source": [
"write_query(paris_fountain_data, location)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit 0fe788c

Please sign in to comment.