Skip to content

Commit

Permalink
feat(data): add notebook to check on tooo_many_trips flagued carpools (
Browse files Browse the repository at this point in the history
  • Loading branch information
P3rceval authored Jan 16, 2025
1 parent 9e37c21 commit efb4e07
Showing 1 changed file with 268 additions and 0 deletions.
268 changes: 268 additions & 0 deletions notebooks/analytics/fraudcheck/too_many_trips_flagued.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,268 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Parameters\n",
"- `connection_string` : 'postgresql://postgres:postgres@localhost:5432/local' -> Postgresql URL connection string"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"import pandas as pd\n",
"from sqlalchemy import create_engine, text\n",
"\n",
"df_carpools = pd.read_csv('impacted_carpools.csv')\n",
"connection_string = os.environ['PG_CONNECTION_STRING']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_combined = pd.concat([df_carpools['_id'], df_carpools['suspicious_carpool_id']], ignore_index=True).drop_duplicates()\n",
"id_values = ', '.join(map(str, df_combined))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"engine = create_engine(connection_string, connect_args={'sslmode':'require'})\n",
"\n",
"query = f\"\"\"(\n",
" SELECT\n",
" CC._ID,\n",
" CASE\n",
" WHEN DRIVER_PHONE IS NOT NULL THEN SUBSTR(DRIVER_PHONE, 1, 10)\n",
" ELSE CC.DRIVER_PHONE_TRUNC\n",
" END AS PHONE_TRUNC,\n",
" CC.DRIVER_IDENTITY_KEY AS IDENTITY_KEY,\n",
" CC.DRIVER_OPERATOR_USER_ID AS OPERATOR_USER_ID,\n",
" CC.OPERATOR_TRIP_ID,\n",
" CC.START_DATETIME,\n",
" EXTRACT(\n",
" EPOCH\n",
" FROM\n",
" (CC.END_DATETIME - CC.START_DATETIME)\n",
" )::INT AS DURATION,\n",
" CC.END_DATETIME,\n",
" CC.OPERATOR_ID,\n",
" TRUE AS IS_DRIVER,\n",
" CASE\n",
" WHEN PASSENGER_PHONE IS NOT NULL THEN SUBSTR(PASSENGER_PHONE, 1, 10)\n",
" ELSE CC.PASSENGER_PHONE_TRUNC\n",
" END AS OTHER_PHONE_TRUNC,\n",
" CC.PASSENGER_IDENTITY_KEY AS OTHER_IDENTITY_KEY,\n",
" CC.PASSENGER_OPERATOR_USER_ID AS OTHER_OPERATOR_USER_ID\n",
" FROM\n",
" CARPOOL_V2.CARPOOLS CC\n",
" WHERE\n",
" CC._id IN ({id_values})\n",
")\n",
"UNION ALL\n",
"(\n",
" SELECT\n",
" CC._ID,\n",
" CASE\n",
" WHEN PASSENGER_PHONE IS NOT NULL THEN SUBSTR(PASSENGER_PHONE, 1, 10)\n",
" ELSE CC.PASSENGER_PHONE_TRUNC\n",
" END AS PHONE_TRUNC,\n",
" CC.PASSENGER_IDENTITY_KEY AS IDENTITY_KEY,\n",
" CC.OPERATOR_TRIP_ID,\n",
" CC.PASSENGER_OPERATOR_USER_ID AS OPERATOR_USER_ID,\n",
" CC.START_DATETIME,\n",
" EXTRACT(\n",
" EPOCH\n",
" FROM\n",
" (CC.END_DATETIME - CC.START_DATETIME)\n",
" )::INT AS DURATION,\n",
" CC.END_DATETIME,\n",
" CC.OPERATOR_ID,\n",
" FALSE::BOOLEAN AS IS_DRIVER,\n",
" CASE\n",
" WHEN DRIVER_PHONE IS NOT NULL THEN SUBSTR(DRIVER_PHONE, 1, 10)\n",
" ELSE CC.DRIVER_PHONE_TRUNC\n",
" END AS OTHER_PHONE_TRUNC,\n",
" CC.DRIVER_IDENTITY_KEY AS OTHER_IDENTITY_KEY,\n",
" CC.DRIVER_OPERATOR_USER_ID AS OTHER_OPERATOR_USER_ID\n",
" FROM\n",
" CARPOOL_V2.CARPOOLS CC\n",
" WHERE\n",
" CC._id IN ({id_values})\n",
")\n",
"\"\"\"\n",
"\n",
"with engine.connect() as conn:\n",
" df_carpool = pd.read_sql_query(text(query), conn)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_carpool['date'] = df_carpool['start_datetime'].dt.date"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_carpool_agg = df_carpool.groupby(['identity_key', 'date']).agg(\n",
" unique_operator_trip_id=('operator_trip_id', 'nunique'),\n",
" unique_operator_id=('operator_id', 'nunique'),\n",
" carpool_id_list=('_id', list)).reset_index()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"multi_op_mask = df_carpool_agg['unique_operator_id'] > 1\n",
"more_than_four_trip_mask = df_carpool_agg['unique_operator_trip_id'] > 4\n",
"\n",
"df_carpool_target = df_carpool_agg[(multi_op_mask) & (more_than_four_trip_mask)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_carpool_target = df_carpool_target.explode('carpool_id_list').rename(columns={'carpool_id_list': '_id'})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_carpool_target_merged = pd.merge(df_carpool_target, df_carpool, on=['_id', 'identity_key'], how = 'left').sort_values(by=['identity_key', 'start_datetime'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_every_trip_after_4_trip_id_occ(df):\n",
" df_grouped_by_trip = df.groupby('operator_trip_id').agg(\n",
" start_datetime=('start_datetime', 'first'),\n",
" _id=('_id', 'first')\n",
").reset_index().sort_values(by=['start_datetime'])\n",
" return df_grouped_by_trip.iloc[4:]\n",
"\n",
"df_final_result = df_carpool_target_merged.groupby(['identity_key']).apply(lambda x: get_every_trip_after_4_trip_id_occ(x))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_wrong_flagued_id = df_carpools.copy()\n",
"df_wrong_flagued_id.drop_duplicates(subset='suspicious_carpool_id', inplace=True)\n",
"df_wrong_flagued_id.drop(columns=['_id'], inplace=True)\n",
"df_wrong_flagued_id.rename(columns={'suspicious_carpool_id': '_id'}, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_final_result_id = df_final_result.copy() \n",
"df_final_result_id"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_wrong_flagued_verified = df_wrong_flagued_id[~df_wrong_flagued_id['_id'].isin(df_final_result_id['_id'])]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"engine = create_engine(connection_string, connect_args={'sslmode':'require'})\n",
"\n",
"query = f\"\"\"(\n",
" SELECT\n",
" CC._ID,\n",
" CC.operator_journey_id,\n",
" CC.operator_id,\n",
" FL.*\n",
" FROM\n",
" CARPOOL_V2.CARPOOLS CC\n",
" JOIN\n",
" fraudcheck.labels fl on fl.carpool_id = CC._id \n",
" WHERE\n",
" CC._id IN ({', '.join(map(str, df_wrong_flagued_verified['_id']))}) and fl.label = 'interoperator_too_many_trips_by_day'\n",
")\n",
"\"\"\"\n",
"\n",
"with engine.connect() as conn:\n",
" df_unwanted_flagued_carpools = pd.read_sql_query(text(query), conn)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_unwanted_flagued_carpools.to_csv('unwanted_flagued_carpools.csv', index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit efb4e07

Please sign in to comment.