{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "36bwUUXKndg8"
},
"source": [
"**Практическая работа №4**\n",
"\n",
"\n",
"# Обнаружение злоумышленников в системе мобильных денежных переводов\n",
"\n",
"\n",
"1) настройка окружения"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "b49QVpuYemMp",
"outputId": "925a677e-d7ba-406c-99d6-6642dc48bb82"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: scipy==1.8.1 in c:\\users\\женька\\appdata\\roaming\\python\\python310\\site-packages (1.8.1)\n",
"Requirement already satisfied: numpy<1.25.0,>=1.17.3 in c:\\programdata\\anaconda3\\lib\\site-packages (from scipy==1.8.1) (1.23.5)\n",
"Requirement already satisfied: networkx==2.7.0 in c:\\users\\женька\\appdata\\roaming\\python\\python310\\site-packages (2.7)\n",
"Defaulting to user installation because normal site-packages is not writeable\n",
"Requirement already satisfied: pyvis in c:\\users\\женька\\appdata\\roaming\\python\\python310\\site-packages (0.3.2)\n",
"Requirement already satisfied: jinja2>=2.9.6 in c:\\programdata\\anaconda3\\lib\\site-packages (from pyvis) (3.1.2)\n",
"Requirement already satisfied: ipython>=5.3.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from pyvis) (8.10.0)\n",
"Requirement already satisfied: networkx>=1.11 in c:\\users\\женька\\appdata\\roaming\\python\\python310\\site-packages (from pyvis) (2.7)\n",
"Requirement already satisfied: jsonpickle>=1.4.1 in c:\\users\\женька\\appdata\\roaming\\python\\python310\\site-packages (from pyvis) (3.0.1)\n",
"Requirement already satisfied: backcall in c:\\programdata\\anaconda3\\lib\\site-packages (from ipython>=5.3.0->pyvis) (0.2.0)\n",
"Requirement already satisfied: jedi>=0.16 in c:\\programdata\\anaconda3\\lib\\site-packages (from ipython>=5.3.0->pyvis) (0.18.1)\n",
"Requirement already satisfied: matplotlib-inline in c:\\programdata\\anaconda3\\lib\\site-packages (from ipython>=5.3.0->pyvis) (0.1.6)\n",
"Requirement already satisfied: pygments>=2.4.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from ipython>=5.3.0->pyvis) (2.11.2)\n",
"Requirement already satisfied: pickleshare in c:\\programdata\\anaconda3\\lib\\site-packages (from ipython>=5.3.0->pyvis) (0.7.5)\n",
"Requirement already satisfied: prompt-toolkit<3.1.0,>=3.0.30 in c:\\programdata\\anaconda3\\lib\\site-packages (from ipython>=5.3.0->pyvis) (3.0.36)\n",
"Requirement already satisfied: stack-data in c:\\programdata\\anaconda3\\lib\\site-packages (from ipython>=5.3.0->pyvis) (0.2.0)\n",
"Requirement already satisfied: traitlets>=5 in c:\\programdata\\anaconda3\\lib\\site-packages (from ipython>=5.3.0->pyvis) (5.7.1)\n",
"Requirement already satisfied: decorator in c:\\programdata\\anaconda3\\lib\\site-packages (from ipython>=5.3.0->pyvis) (5.1.1)\n",
"Requirement already satisfied: colorama in c:\\programdata\\anaconda3\\lib\\site-packages (from ipython>=5.3.0->pyvis) (0.4.6)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from jinja2>=2.9.6->pyvis) (2.1.1)\n",
"Requirement already satisfied: parso<0.9.0,>=0.8.0 in c:\\programdata\\anaconda3\\lib\\site-packages (from jedi>=0.16->ipython>=5.3.0->pyvis) (0.8.3)\n",
"Requirement already satisfied: wcwidth in c:\\programdata\\anaconda3\\lib\\site-packages (from prompt-toolkit<3.1.0,>=3.0.30->ipython>=5.3.0->pyvis) (0.2.5)\n",
"Requirement already satisfied: executing in c:\\programdata\\anaconda3\\lib\\site-packages (from stack-data->ipython>=5.3.0->pyvis) (0.8.3)\n",
"Requirement already satisfied: pure-eval in c:\\programdata\\anaconda3\\lib\\site-packages (from stack-data->ipython>=5.3.0->pyvis) (0.2.2)\n",
"Requirement already satisfied: asttokens in c:\\programdata\\anaconda3\\lib\\site-packages (from stack-data->ipython>=5.3.0->pyvis) (2.0.5)\n",
"Requirement already satisfied: six in c:\\programdata\\anaconda3\\lib\\site-packages (from asttokens->stack-data->ipython>=5.3.0->pyvis) (1.16.0)\n"
]
}
],
"source": [
"%pip install --user scipy==1.8.1\n",
"%pip install --user networkx==2.7.0\n",
"#uncomment when running in Google Collab\n",
"#!apt install python3-dev graphviz libgraphviz-dev pkg-config\n",
"#!pip install pygraphviz\n",
"%pip install pyvis\n",
"\n",
"import zipfile\n",
"import itertools\n",
"import tempfile\n",
"import math\n",
"\n",
"from functools import reduce\n",
"from pyvis import network as net\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"import networkx as nx\n",
"import plotly.express as px\n",
"import plotly.graph_objects as go\n",
"\n",
"from plotly.offline import iplot\n",
"from IPython.display import display, HTML\n",
"\n",
"\n",
"\n",
"\n",
"#for Jupiter notebooks\n",
"import plotly.io as pio #comment for Google collab\n",
"pio.renderers.default='notebook'#comment for Google collab\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import zipfile\n",
"import itertools\n",
"import tempfile\n",
"import math\n",
"\n",
"from functools import reduce\n",
"#from pyvis import network as net\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"import networkx as nx\n",
"import plotly.express as px\n",
"import plotly.graph_objects as go\n",
"\n",
"from plotly.offline import iplot\n",
"from IPython.display import display, HTML\n",
"\n",
"pd.options.plotting.backend = \"plotly\"\n",
"\n",
"\n",
"\n",
"#for jupiter notebook\n",
"from plotly.offline import init_notebook_mode #for Google collab comment these two lines \n",
"init_notebook_mode(connected=True) #for Google collab comment these two lines "
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "eO8P9tVa1LU3"
},
"source": [
"# Загрузка данных \n",
"Возможно потребуется адаптировать под ваши условия."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "--dQffaaxORN",
"outputId": "395bc6de-d351-4f29-e9d6-c0db8fb9b24d"
},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: 'C:\\\\Practice\\\\data.zip'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[2], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m zip_filepath\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mC:\u001b[39m\u001b[39m\\\u001b[39m\u001b[39mPractice\u001b[39m\u001b[39m\\\u001b[39m\u001b[39mdata.zip\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m----> 3\u001b[0m \u001b[39mwith\u001b[39;00m zipfile\u001b[39m.\u001b[39;49mZipFile(zip_filepath) \u001b[39mas\u001b[39;00m z:\n\u001b[1;32m 4\u001b[0m \u001b[39mprint\u001b[39m(z\u001b[39m.\u001b[39mnamelist())\n\u001b[1;32m 5\u001b[0m \u001b[39mfor\u001b[39;00m name \u001b[39min\u001b[39;00m z\u001b[39m.\u001b[39mnamelist():\n",
"File \u001b[0;32m/usr/lib64/python3.11/zipfile.py:1283\u001b[0m, in \u001b[0;36mZipFile.__init__\u001b[0;34m(self, file, mode, compression, allowZip64, compresslevel, strict_timestamps, metadata_encoding)\u001b[0m\n\u001b[1;32m 1281\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[1;32m 1282\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m-> 1283\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfp \u001b[39m=\u001b[39m io\u001b[39m.\u001b[39;49mopen(file, filemode)\n\u001b[1;32m 1284\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mOSError\u001b[39;00m:\n\u001b[1;32m 1285\u001b[0m \u001b[39mif\u001b[39;00m filemode \u001b[39min\u001b[39;00m modeDict:\n",
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'C:\\\\Practice\\\\data.zip'"
]
}
],
"source": [
"zip_filepath='C:\\Practice\\data.zip'\n",
"\n",
"with zipfile.ZipFile(zip_filepath) as z:\n",
" print(z.namelist())\n",
" for name in z.namelist():\n",
" with open(name, 'wb') as f:\n",
" f.write(z.read(name))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"id": "bTKfidonJBfI"
},
"outputs": [],
"source": [
"def pyvis_deepnote_show(nt):\n",
" tmp_output_filename = tempfile.NamedTemporaryFile(suffix='.html').name\n",
" nt.save_graph(tmp_output_filename)\n",
"\n",
" f = open(tmp_output_filename, \"r\")\n",
" display(HTML(f.read()))"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "3HDyVfm9JF2S"
},
"source": [
"# Чтение и предобработка данных \n",
"\n",
"В тестовом примере мы рассмотрим данные с метками аномалий.\n",
"Сначала преобрзуем в dataframe, уберем часть лишних колонок, которые не имеют отношения к тем финансовым мошенничествам, которые есть в данных (по условию генерации данных)\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "pUWDQTUCwcbD",
"outputId": "ef666a6a-ac02-4953-eb2a-34cbe1514f89"
},
"outputs": [
{
"ename": "TypeError",
"evalue": "NDFrame.describe() got an unexpected keyword argument 'datetime_is_numeric'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[4], line 44\u001b[0m\n\u001b[1;32m 38\u001b[0m df \u001b[39m=\u001b[39m df\u001b[39m.\u001b[39mdrop(\u001b[39m'\u001b[39m\u001b[39mBalance after (receiver)\u001b[39m\u001b[39m'\u001b[39m, axis\u001b[39m=\u001b[39m\u001b[39m1\u001b[39m)\n\u001b[1;32m 43\u001b[0m df[\u001b[39m'\u001b[39m\u001b[39mGroundtruth\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m df[\u001b[39m'\u001b[39m\u001b[39mGroundtruth\u001b[39m\u001b[39m'\u001b[39m]\u001b[39m.\u001b[39mstr\u001b[39m.\u001b[39mreplace(\u001b[39m'\u001b[39m\u001b[39m-\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39m_\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[0;32m---> 44\u001b[0m df\u001b[39m.\u001b[39;49mdescribe(include\u001b[39m=\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mall\u001b[39;49m\u001b[39m'\u001b[39;49m, datetime_is_numeric\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m)\u001b[39m.\u001b[39mfillna(\u001b[39m'\u001b[39m\u001b[39m'\u001b[39m)\n",
"\u001b[0;31mTypeError\u001b[0m: NDFrame.describe() got an unexpected keyword argument 'datetime_is_numeric'"
]
}
],
"source": [
"df = pd.read_csv('./FinFraud_Labelled.csv', sep='|', parse_dates=[16, 17, 22])\n",
"# в файлах с вариантом задания, разделитель - \";\" \n",
"df.columns = [\n",
" 'Groundtruth', \n",
" 'User ID (sender)', \n",
" 'User ID (receiver)',\n",
" 'User account ID (sender)',\n",
" 'User account ID (receiver)',\n",
" 'Amount of transaction',\n",
" 'Type of transaction',\n",
" 'State of operation',\n",
" 'Balance before (sender)',\n",
" 'Balance after (sender)',\n",
" 'Balance after (receiver)',\n",
" 'Balance before (receiver)', \n",
" 'Not used',\n",
" 'Not used',\n",
" 'Not used',\n",
" 'Not used',\n",
" 'Transaction timestamp (sender)',\n",
" 'Transaction timestamp (receiver)',\n",
" 'Sender account ID',\n",
" 'Not used',\n",
" 'Not used',\n",
" 'Not used',\n",
" 'Transaction timestamp',\n",
" 'Sender type',\n",
" 'Receiver type'\n",
"]\n",
"df = df.loc[:, ~df.columns.str.contains('^Not used', case=False)].sort_values('Transaction timestamp') \n",
"df = df.drop('State of operation', axis=1)\n",
"df = df.drop('Sender account ID', axis=1)\n",
"df = df.drop('Transaction timestamp (sender)', axis=1)\n",
"df = df.drop('Transaction timestamp (receiver)', axis=1)\n",
"df = df.drop('Balance before (sender)', axis=1)\n",
"df = df.drop('Balance after (sender)', axis=1)\n",
"df = df.drop('Balance before (receiver)', axis=1)\n",
"df = df.drop('Balance after (receiver)', axis=1)\n",
"\n",
"\n",
"\n",
"\n",
"df['Groundtruth'] = df['Groundtruth'].str.replace('-', '_')\n",
"df.describe(include='all').fillna('')"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "qnyhlynM_1PD"
},
"source": [
"## Описание набора данных"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "uXtlGn1K2ICL"
},
"source": [
"\n",
"| Название столбца | Возможные значения |Описание |\n",
"|----------------------------------------|----------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n",
"| Groundtruth | `N_RegC2C`
`N_RegDep`
`N_Reg_RC`
`N_RegWith`
`N_Reg_Merch`
`F_bot`
`F_Mule_With`
`F_SevWith` |`N_RegC2C` – легитимные денежные транзакции
`N_RegDep` – пополнение электронного кошелька
`N_Reg_RC` – пополнение баланса мобильной связи
`N_RegWith` – снятие денег с электронного кошелька
`N_Reg_Merch` – оплата услуг и товаров
Мошенничества:
`F_bot` – транзакции, выполняемые вредоносном ПО, установленном на телефоне, перевод выполняется подставлному лицу (\"ослу\" )\", который затем обналичивает деньги
`F_Mule_With` – снятия денежных средств подставлным лицом
`F_SevWith` – транзакции, выполняемые воров после кражи телефона |\n",
"| User ID (transaction sender) | Generated ID | |\n",
"| User ID (transaction receiver) | Generated ID | |\n",
"| User account ID (transaction sender) | Generated ID | |\n",
"| User account ID (transaction receiver) | Generated ID | |\n",
"| Amount of transaction | Number | |\n",
"| Type of transaction | `Ind`
`Dt`
`ArRC`
`Wl`
`Merchant` | Тип транзакции
`Ind` – денежный перевод между пользователями системы
`Dt` – пополнение электронного кошелька (отправитель агент, а получатель - пользователь системы)
`ArRC` – пополнение счета мобильной связи (перевод от пользователя системы к оператору мобильной связи )
`Wl` – снятие электронных денег (отправитель - пользователь системы, получатель - оператор)
`Merchant` – перевод от пользователя поставщику услуг или товаров |\n",
"| State of operation | `SU` | `SU` – успешно |\n",
"| Balance before (transaction sender) | Number | |\n",
"| Balance before (transaction receiver) | Number | |\n",
"| Balance after (transaction sender) | Number | |\n",
"| Balance after (transaction receiver) | Number | |\n",
"| Transaction timestamp (sender) | Datetime | |\n",
"| Transaction timestamp (receiver) | Datetime | |\n",
"| Sender account ID | Generated ID | |\n",
"| Transaction timestamp | Datetime | |\n",
"| Sender type | `EU`
`RET` | |\n",
"| Receiver type | `EU`
`operator`
`RET`
`MER` | |\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "vA1VgIkNAVoJ"
},
"source": [
"Поскольку поле `State of operation` всегда имеет значение (`SU`) для всех транзакций, данный столбец предлагается удалить. \n",
"Столбцы `Sender account ID` и `User ID (transaction sender)` идентичны, также столбцы `Transaction timestamp (sender)` и `Transaction timestamp (receiver)` идентичны стобцу `Transaction timestamp`, поэтому данные стобцы удалются (остается только `Transaction timestamp`). Также удаляюся столбцы с балансом, т.к. в текущей версии набора данных они не задействованы.\n",
"\n",
"Значения поля `Groundtruth` преобрзованы в общий вид. Они используются только для проверки."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "XJP-UnR_AxI7",
"outputId": "65cdb2c3-e131-4553-daf1-cec6473d8406"
},
"outputs": [
{
"data": {
"text/plain": [
"Groundtruth object\n",
"User ID (sender) object\n",
"User ID (receiver) object\n",
"User account ID (sender) object\n",
"User account ID (receiver) object\n",
"Amount of transaction float64\n",
"Type of transaction object\n",
"Transaction timestamp datetime64[ns]\n",
"Sender type object\n",
"Receiver type object\n",
"dtype: object"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "xwBs_xG1GY-r"
},
"source": [
"#Статистика транзакций для каждого пользователя\n",
"\n",
"Традиционно начнем со статистического анализа данных. Рекомендуется расширить число рассчитываемых статистик, например, включив показатели, характеризующие частоту транзакций. Для такого вида мошенничества как кража телефона изменение частоты снятий является характерным признаком."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"id": "T6FQYWmqGhcW"
},
"outputs": [],
"source": [
"def init_stat_dict():\n",
" stat_dict = dict()\n",
" transaction_types = {\"Ind\", \"Wl\", \"Dt\", \"Merchant\", \"ArRC\"} \n",
" for tran_type in transaction_types:\n",
" amount_name = f\"Sent_amount_{tran_type}\"\n",
" amount_median = f\"Sent_amount_{tran_type}_median\"\n",
" amount_min = f\"Sent_amount_{tran_type}_min\"\n",
" amount_max = f\"Sent_amount_{tran_type}_max\"\n",
" tran_count = f\"Sent_{tran_type}_count\"\n",
" rec_amount_name = f\"Received_amount_{tran_type}\"\n",
" rec_amount_median = f\"Received_amount_{tran_type}_median\"\n",
" rec_amount_min = f\"Received_amount_{tran_type}_min\"\n",
" rec_amount_max = f\"Received_amount_{tran_type}_max\"\n",
" rec_tran_count = f\"Received_{tran_type}_count\"\n",
" \n",
" stat_dict[amount_name] = 0\n",
" stat_dict[amount_median] = 0\n",
" stat_dict[amount_min] = 0\n",
" stat_dict[amount_max] = 0\n",
" stat_dict[tran_count] = 0\n",
" stat_dict[rec_amount_name] = 0\n",
" stat_dict[rec_amount_median] = 0\n",
" stat_dict[rec_amount_min] = 0\n",
" stat_dict[rec_amount_max] = 0\n",
" stat_dict[rec_tran_count] = 0\n",
"\n",
" return stat_dict\n",
"\n",
"\n",
"def get_stat_df(df):\n",
" sent_unique_users = df[\"User ID (sender)\"].unique()\n",
" received_unique_users = df[\"User ID (receiver)\"].unique()\n",
" unique_users = np.unique(np.concatenate((sent_unique_users,received_unique_users),0))\n",
" #unique_users = pd.concat(sent_unique_users,received_unique_users).drop_duplicates().reset_index(drop=True)\n",
" print(unique_users)\n",
" stat_df = pd.DataFrame()\n",
" stat_dict = init_stat_dict()\n",
" transaction_types = {\"Ind\", \"Wl\", \"Dt\", \"Merchant\", \"ArRC\"}\n",
" for user in unique_users:\n",
" stat_dict = init_stat_dict() \n",
" stat_dict[\"User ID\"] = user\n",
"\n",
" user_df = df.loc[(df[\"User ID (sender)\"] == user)]\n",
" \n",
" if (not user_df.empty):\n",
" #stat_dict[\"User ID\"] = user\n",
" \n",
" stat_dict[\"Unique_receivers\"] = len(user_df[\"User ID (receiver)\"].unique())\n",
" stat_dict[\"User type\"] = user_df[\"Sender type\"].unique()[0]\n",
"\n",
" for tran_type in transaction_types:\n",
" amount_name = f\"Sent_amount_{tran_type}\"\n",
" amount_median = f\"Sent_amount_{tran_type}_median\"\n",
" amount_min = f\"Sent_amount_{tran_type}_min\"\n",
" amount_max = f\"Sent_amount_{tran_type}_max\"\n",
" tran_count = f\"Sent_{tran_type}_count\"\n",
" stat_dict[amount_name] = (user_df.loc[user_df[\"Type of transaction\"]==tran_type])[\"Amount of transaction\"].sum()\n",
" stat_dict[amount_median] = (user_df.loc[user_df[\"Type of transaction\"]==tran_type])[\"Amount of transaction\"].mean()\n",
" stat_dict[amount_min] = (user_df.loc[user_df[\"Type of transaction\"]==tran_type])[\"Amount of transaction\"].min()\n",
" stat_dict[amount_max] = (user_df.loc[user_df[\"Type of transaction\"]==tran_type])[\"Amount of transaction\"].max()\n",
" stat_dict[tran_count] = (user_df.loc[user_df[\"Type of transaction\"]==tran_type])[\"Amount of transaction\"].count()\n",
" else:\n",
" stat_dict[\"User type\"] = (df.loc[(df[\"User ID (receiver)\"]==user)])[\"Receiver type\"].unique()[0]\n",
"\n",
" user_df = df.loc[(df[\"User ID (receiver)\"] == user)]\n",
" if (not user_df.empty):\n",
" stat_dict[\"Unique_senders\"] = len(user_df[\"User ID (sender)\"].unique())\n",
" for tran_type in transaction_types:\n",
" rec_amount_name = f\"Received_amount_{tran_type}\"\n",
" rec_amount_median = f\"Received_amount_{tran_type}_median\"\n",
" rec_amount_min = f\"Received_amount_{tran_type}_min\"\n",
" rec_amount_max = f\"Received_amount_{tran_type}_max\"\n",
" rec_tran_count = f\"Received_{tran_type}_count\"\n",
" stat_dict[rec_amount_name] = (user_df.loc[user_df[\"Type of transaction\"]==tran_type])[\"Amount of transaction\"].sum()\n",
" stat_dict[rec_amount_median] = (user_df.loc[user_df[\"Type of transaction\"]==tran_type])[\"Amount of transaction\"].median()\n",
" stat_dict[rec_amount_min] = (user_df.loc[user_df[\"Type of transaction\"]==tran_type])[\"Amount of transaction\"].min()\n",
" stat_dict[rec_amount_max] = (user_df.loc[user_df[\"Type of transaction\"]==tran_type])[\"Amount of transaction\"].max()\n",
" stat_dict[rec_tran_count] = (user_df.loc[user_df[\"Type of transaction\"]==tran_type])[\"Amount of transaction\"].count()\n",
" \n",
" df_temp = pd.DataFrame([stat_dict])\n",
" \n",
" #df_temp.head()\n",
" stat_df = pd.concat([stat_df, df_temp])\n",
" stat_df = stat_df.fillna(0)\n",
" return stat_df\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "BcxlXwNWAQeB"
},
"source": [
"Кстати, обратите внимание уникальных пользователей в системе 2009. Это больше, чем число уникальных отправителей и уникальных получателей, значит, какие то пользователи только отправляют деньги, а какие-то только получают."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "O-i5IOdE0JG0",
"outputId": "91e41ccd-e4ab-445d-b9e1-dfda76630166"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['PN_EU_0_0' 'PN_EU_0_1' 'PN_EU_0_10' ... 'PN_Ret5' 'PN_Ret6' 'operator']\n",
"(2009, 54)\n"
]
}
],
"source": [
"stat_df = get_stat_df(df)\n",
"print(stat_df.shape)\n",
"\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "sFxDkxTvAxwF"
},
"source": [
"Получив статистику по пользователям, вы можете выполнить операцию кластеризации, чтобы посмотреть, какие группы пользователей есть. И уже выполнять анализ данных по группам пользователей. Обработка и построение графиков к Google Collab - достаточно длительный процесс. \n",
"В данном случае, была выбрана часть статистик и построила проекции пользователей. Анализируемые поля были выбраны на основе анализа свойств возможных финансовых аномалий (т.е. просто эвристически:))."
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"id": "LsANfoDJjXRh"
},
"outputs": [],
"source": [
"from pandas.plotting import scatter_matrix\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.decomposition import PCA\n",
"from matplotlib.ticker import FormatStrFormatter\n",
"import plotly.express as px"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Мошенничество, связанное с заражением бот-сетью."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "m4zT73PgDZgF"
},
"source": [
"\n",
"\n",
"Согласно описанию сценария атаки: есть множество зараженных пользователей, которые переводят деньги какому-то пользователю (\"ослу\" или \"мулу\"), и уже он выполняет операции обналичивания денег. Рассмотрен простейщий вариант сценария: цепочка мулов состоит из одного звена. "
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mm4AtlIZjYjV",
"outputId": "6201330a-bc7c-49f6-d39d-0c4177282f05"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Explained variance: [0.40133876 0.3262758 0.19799872]\tSum: 0.9256132760462193\n"
]
}
],
"source": [
"\n",
"#оставляем поля, связанные с переводами и снятиями и добавили число уникальных пользователей, это же бот сеть.\n",
"\n",
"MobileBot_labels = ['Unique_receivers','Unique_receivers','Sent_Ind_count' ,'Sent_Wl_count', 'Received_Ind_count']\n",
"\n",
"# а по этим полям будем пробовать найти пользователей с кражей телефона.\n",
"MobileTheft_labels = ['Sent_amount_Wl', 'Sent_amount_Wl_median', 'Sent_amount_Wl_min', 'Sent_amount_Wl_max', 'Sent_Wl_count']\n",
"\n",
"x = stat_df[MobileBot_labels].values\n",
"\n",
"# нормализуем значения\n",
"x = StandardScaler().fit_transform(x)\n",
"\n",
"pca = PCA(n_components=3)\n",
"principalComponents = pca.fit_transform(x)\n",
"print(f'Explained variance: {pca.explained_variance_ratio_}\\tSum: {pca.explained_variance_ratio_.sum()}')\n"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 542
},
"id": "hd3Eo0vZj3uC",
"outputId": "3dcd6f66-08c6-4759-8856-150624f64914"
},
"outputs": [
{
"data": {
"text/html": [
"\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"\n",
"principalDf = pd.DataFrame(data=principalComponents , columns=['PC1', 'PC2', 'PC3'])\n",
"\n",
"\n",
"\n",
"fig = px.scatter(principalDf, x=\"PC1\", y=\"PC2\", color=stat_df['User type'],\n",
" size=stat_df['Unique_senders'].apply(lambda x: 1 if x == 0 else math.log(x,10) ), \n",
" hover_name=stat_df['User ID'], \n",
" \n",
" opacity = 0.5,\n",
" color_discrete_map={\n",
" 'EU': '#377eb8',\n",
" 'operator': '#e41a1c',\n",
" 'RET': '#4daf4a',\n",
" 'MER': '#984ea3',\n",
" })\n",
"\n",
"#for GOogle colab uncomment\n",
"pio.renderers.default = 'iframe' #comment this lines for GOogle colab\n",
"\n",
"fig.show()\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "I4dIVKypkoHp"
},
"source": [
"Проанализировав график рассеивания, можно увидеть группу пользователей с большим числом уникальных отправителей. Они могут быть \"мулами\" через которые отмываются деньги, проверим эту гипотезу. \n",
"Рассмотрим пользователей: 'PN_EU_0_955', 'PN_EU_0_260', 'PN_EU_1_328', 'PN_0_1045'. ('PN_EU_0_260' и 'PN_0_1045' накладываются друг на друга, не хватает jitter или возможности перемещать объекты).\n",
"А еще есть подозрительный пользователь PN_EU_2_154.\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Ниже график рассеивания для пары параметров, можно поисследовать возможные зависимости."
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"fig = px.scatter(stat_df, x='Sent_amount_Wl_max', y='Sent_amount_Wl_median', color=stat_df['User type'],\n",
" size=stat_df['Unique_senders'].apply(lambda x: 1 if x == 0 else math.log(x,10) ), \n",
" hover_name=stat_df['User ID'], \n",
" \n",
" opacity = 0.5,\n",
" color_discrete_map={\n",
" 'EU': '#377eb8',\n",
" 'operator': '#e41a1c',\n",
" 'RET': '#4daf4a',\n",
" 'MER': '#984ea3',\n",
" })\n",
"\n",
"fig.show()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "obYJYVFo7PsV"
},
"source": [
"# Граф сети"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "x_ej2qQbdoXc"
},
"source": [
"Создание графа на основе данных с помощью библиотеки Networkx.\n",
"вершины добавлена некоторая статистическая информация. Вы можете модифицировать ее, брать ее из дата ферйма stat_df.\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"id": "RBxh9up7unsA"
},
"outputs": [],
"source": [
"def create_network_for_df(df):\n",
" G = nx.MultiDiGraph()\n",
"\n",
" for _, row in df.iterrows():\n",
" sender_node_id = row[\"User ID (sender)\"]\n",
" receiver_node_id = row[\"User ID (receiver)\"]\n",
" sent_tran_num = dict({'Ind': 0, 'Wl': 0, 'Dt':0, 'ArRC':0, 'Merchant':0})\n",
" received_tran_num = dict({'Ind': 0, 'Wl': 0, 'Dt':0, 'ArRC':0, 'Merchant':0})\n",
" sent_tran_amount = dict({'Ind': 0, 'Wl': 0, 'Dt':0, 'ArRC':0, 'Merchant':0})\n",
" received_tran_amount = dict({'Ind': 0, 'Wl': 0, 'Dt':0, 'ArRC':0, 'Merchant':0})\n",
"\n",
" tran_type = row[\"Type of transaction\"]\n",
" if sender_node_id in G.nodes().keys():\n",
" \n",
" \n",
" G.nodes[sender_node_id]['sent_transaction_num'][tran_type]+=1 \n",
" G.nodes[sender_node_id]['sent_transaction_amount'][tran_type]+=row[\"Amount of transaction\"] \n",
" else:\n",
" sent_tran_num[tran_type]=1\n",
" sent_tran_amount[tran_type] = row[\"Amount of transaction\"]\n",
" G.add_node(\n",
" sender_node_id, \n",
" type=row[\"Sender type\"], \n",
" account_id=row[\"User account ID (sender)\"],\n",
" sent_transaction_num = sent_tran_num,\n",
" sent_transaction_amount = sent_tran_amount,\n",
" received_transaction_num = dict({'Ind': 0, 'Wl': 0, 'Dt':0, 'ArRC':0, 'Merchant':0}),\n",
" received_transaction_amount = dict({'Ind': 0, 'Wl': 0, 'Dt':0, 'ArRC':0, 'Merchant':0})\n",
" )\n",
" \n",
" if receiver_node_id in G.nodes().keys():\n",
" G.nodes[receiver_node_id]['received_transaction_num'][tran_type]+=1\n",
" G.nodes[sender_node_id]['received_transaction_amount'][tran_type]+=row[\"Amount of transaction\"]\n",
" else:\n",
" received_tran_num[tran_type]=1\n",
" received_tran_amount[tran_type]=row[\"Amount of transaction\"]\n",
" G.add_node(\n",
" receiver_node_id, \n",
" type=row[\"Receiver type\"], \n",
" account_id=row[\"User account ID (receiver)\"],\n",
" transaction_num = 1,\n",
" sent_transaction_num = dict({'Ind': 0, 'Wl': 0, 'Dt':0, 'ArRC':0, 'Merchant':0}),\n",
" sent_transaction_amount = dict({'Ind': 0, 'Wl': 0, 'Dt':0, 'ArRC':0, 'Merchant':0}),\n",
" received_transaction_num = received_tran_num,\n",
" received_transaction_amount = received_tran_amount\n",
" )\n",
"\n",
" G.add_edge(\n",
" row[\"User ID (sender)\"], \n",
" row[\"User ID (receiver)\"], \n",
" groundtruth=row[\"Groundtruth\"],\n",
" amount=row[\"Amount of transaction\"],\n",
" timestamp=row[\"Transaction timestamp\"],\n",
" type=row[\"Type of transaction\"], \n",
" )\n",
" return G"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "La8KwpfEj0iZ"
},
"source": [
"# Отображения графа с помощью библиотеки Plotly\n",
"\n",
"Отрисовка в Google Collab выолняется достаточно медленно, поэтому предлагаем анализировать граф частями.\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "HLnLlos--hyg"
},
"source": [
"**Настройка палитр и внешнего вида**"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"id": "TCJIKGNQ6_eS"
},
"outputs": [],
"source": [
"def draw_palette(colors, shapes=None, title=\"\"):\n",
"\n",
" if shapes:\n",
" assert len(colors) == len(shapes)\n",
"\n",
" x = list(colors.keys())\n",
" y = [1 for _ in x]\n",
" color = list(colors.values())\n",
" shape = None\n",
"\n",
" if shapes:\n",
" shape = list(shapes.values())\n",
" else:\n",
" shape = ['circle' for _ in x]\n",
"\n",
" fig = go.Figure( \n",
" data=[\n",
" go.Scatter(\n",
" mode='markers',\n",
" x=x, y=y, \n",
" marker=dict(size=40, \n",
" color=color, \n",
" symbol=shape,\n",
" line=dict(width=0, color='black')))\n",
" ], \n",
" layout=dict(\n",
" title_text=title, \n",
" yaxis=dict(visible=False, \n",
" showticklabels=False)\n",
" )\n",
" )\n",
"\n",
" fig.show()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"id": "CJLJfaAx9BgH"
},
"outputs": [],
"source": [
"GROUNDTRUTH_TO_COLOR = {\n",
" 'N_RegC2C': '#666666', \n",
" 'N_RegDep': '#666666', \n",
" 'N_Reg_RC': '#666666', \n",
" 'N_RegWith': '#666666',\n",
" 'N_Reg_Merch': '#666666',\n",
" 'F_Mule_With': '#990099',\n",
" 'F_bot': '#EECA3B',\n",
" 'F_SevWith': '#EF553B',\n",
"}\n",
"#палитра set 1 из ColorBrewer \n",
"#e41a1c красный\n",
"#377eb8 синий\n",
"#4daf4a зеленый\n",
"#984ea3 фиолетовый\n",
"TYPE_TO_COLOR = {\n",
" 'EU': '#377eb8',\n",
" 'operator': '#e41a1c',\n",
" 'RET': '#4daf4a',\n",
" 'MER': '#984ea3',\n",
"}\n",
"\n",
"TYPE_TO_SHAPE = {\n",
" 'EU': 'circle',\n",
" 'operator': 'square',\n",
" 'RET': 'diamond',\n",
" 'MER': 'star',\n",
"}\n",
"\n",
"IS_CRIMINAL_TO_COLOR_SIZE_NODE = {\n",
" True: ('#EF553B', 15, 3),\n",
" False: ('gray', 15, 0)\n",
"}\n",
"\n",
"IS_CRIMINAL_TO_COLOR_SIZE_EDGE = {\n",
" True: ('#EF553B', 2),\n",
" False: ('#666666', 1)\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"id": "S2vbZCes8EeT"
},
"outputs": [
{
"data": {
"text/html": [
"\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"draw_palette(TYPE_TO_COLOR, TYPE_TO_SHAPE, title=\"User type palette\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "9tTOCilXkGN7"
},
"source": [
"Настройка внешнего вида узлов графа\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"id": "kiguhNOEdg5e"
},
"outputs": [],
"source": [
"def make_node(layout, node, data, is_criminal):\n",
" x, y = layout[node]\n",
"\n",
" type_t = data['type']\n",
" \n",
" sent_tran_num = sum(data[\"sent_transaction_num\"].values())\n",
" received_tran_num = sum(data[\"received_transaction_num\"].values())\n",
"\n",
" sent_tran_amount = sum(data[\"sent_transaction_amount\"].values())\n",
" received_tran_amount = sum(data[\"received_transaction_amount\"].values())\n",
"\n",
"\n",
" color, size, width = IS_CRIMINAL_TO_COLOR_SIZE_NODE[is_criminal]\n",
"\n",
" user = px.scatter(\n",
" x=[x, None],\n",
" y=[y, None],\n",
" text = [f\"{node}\", None],\n",
" hover_name=[f\"{node}
\"\n",
" #f\"Criminal: {is_criminal}
\"\n",
" f\"Type: {type_t}
\"\n",
" f\"Sent transactions (num)):{sent_tran_num}
\"\n",
" f\"Received transactions (num):{received_tran_num}
\"\n",
" f\"Sent transactions (amount):{sent_tran_amount}
\"\n",
" f\"Received transactions (amount):{received_tran_amount}
\"\n",
" ,None]\n",
" )\n",
" \n",
" \n",
" user.update_traces(\n",
" marker=dict(\n",
" color=TYPE_TO_COLOR[type_t],\n",
" size=size, \n",
" line=dict(\n",
" width=width,\n",
" color=color\n",
" ),\n",
" symbol=TYPE_TO_SHAPE[type_t]\n",
" ),\n",
" textposition='bottom center',\n",
" textfont_size=8\n",
" )\n",
"\n",
" return user\n",
"# \n",
"def make_edge(layout, node_id1, node_id2, G, criminals):\n",
" x0, y0 = layout[node_id1]\n",
" x1, y1 = layout[node_id2]\n",
"\n",
" node1 = G.nodes[node_id1] \n",
" node2 = G.nodes[node_id2]\n",
"\n",
" is_sender_criminal = True if node_id1 in criminals else False\n",
" is_receiver_criminal = True if node_id2 in criminals else False\n",
"\n",
" color, size = IS_CRIMINAL_TO_COLOR_SIZE_EDGE[is_sender_criminal]\n",
"\n",
" # Edge\n",
" trace = px.line(\n",
" x=[x0, x1, None], \n",
" y=[y0, y1, None],\n",
" hover_data=None,\n",
" )\n",
" \n",
" trace.update_traces(\n",
" line_color=color,\n",
" line_width=size,\n",
" hovertemplate = None,\n",
" hoverinfo = \"skip\",\n",
" )\n",
"\n",
" node1_trace = make_node(layout, node_id1, node1, is_sender_criminal)\n",
" node2_trace = make_node(layout, node_id2, node2, is_receiver_criminal)\n",
"\n",
" return trace.data, (node1_trace.data + node2_trace.data)\n",
"\n",
"\n",
"def draw_network(G, title, crime_type=\"\", criminals=set()): \n",
" layout = nx.nx_agraph.graphviz_layout(G, prog = 'twopi')\n",
" #полезными могут быть укладки dot и twopi (параметр prog)\n",
" \n",
" #Поэкспериментируйте с разными раскладками\n",
" \n",
" #layout = nx.spring_layout(G)\n",
" #layout = nx.circular_layout(G)\n",
" #layout = nx.kamada_kawai_layout(G, scale = 10 )\n",
" \n",
"\n",
" edges_data = ()\n",
" nodes_data = ()\n",
" for node1, node2, _ in G.edges(data=True):\n",
" edge, nodes = make_edge(layout, node1, node2, G, criminals)\n",
" edges_data += edge\n",
" nodes_data += nodes\n",
"\n",
" all_fig = go.Figure((*edges_data, *nodes_data))\n",
" all_fig.update_layout(autosize=True, width=900, height=900)\n",
" all_fig.write_html(f\"{crime_type}_{title}_plotly.html\")\n",
" all_fig.show()\n",
"\n",
"\n",
"\n",
"def draw_network_for_df(df, title, crime_type=\"\", criminals=set()):\n",
" G = create_network_for_df(df)\n",
" draw_network(G, title, crime_type, criminals)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "BgjvGe-Qkl5Y"
},
"source": [
"Построим граф контактов для выбранных ранее пользователей 'PN_EU_0_955', 'PN_EU_0_260', 'PN_EU_1_328'"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "C8aJ84NbD98a",
"outputId": "e4853446-ebb3-41be-d6ec-9867a76fa7c7"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(1207, 10)\n"
]
}
],
"source": [
"df = df.sort_values(by=['Transaction timestamp'], ascending=True)\n",
"suspected_users = ['PN_EU_0_955', 'PN_EU_0_260', 'PN_EU_1_328', 'PN_0_1045']\n",
"#suspected_users = ['PN_EU_2_154'] # постройте граф контактов для подозрительного пользователя\n",
"#suspected_users = ['PN_EU_0_955'] # постройте граф контактов для мула и сравните с подозрительным\n",
"df_selected = df.loc[((df[\"User ID (sender)\"].isin(suspected_users)) | (df[\"User ID (receiver)\"].isin(suspected_users)))]\n",
"print(df_selected.shape)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"id": "HK9HsBL9ox5-"
},
"outputs": [],
"source": [
"G = create_network_for_df(df_selected)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 937
},
"id": "A5cKzETHofa_",
"outputId": "d0e55ff8-bd9f-4766-aeed-80a61dfa502d",
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"draw_network(G, \"Структура мошенничества\",\"Мобильный ботнет\" )"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "GsSPmM_3ro6v"
},
"source": [
"Давайте опишем, что мы видим.\n",
"Очевидно, что пользователи 'PN_EU_0_955', 'PN_EU_0_260', 'PN_EU_1_328', 'PN_0_1045' имеют одинаковый характер финансовых транзакций (постройте граф для каждого пользователя). Множество пользователей, которое одинаково для всех четырех пользователей, пересылают деньги выбранным 4 пользотелям, которые в последствии выполняют снятие электронных денег. Данная схема очень похожа на искомую схему заражения мобильным ботом. Множество пользователей - это зараженные узлы (ниже определим это множество), а получатели транзакций - очевидно подставные пользователи (мулы).\n",
"\n",
"Обратите внимание, что увидеть взаимосвязи явно можно только с определенным способом укладки вершин графа. ПРоэкспериментируйте с различными алгоритмами укладки!\n",
"\n",
"\n",
"\n",
"Проанализируйте самостоятельно поведение пользователя PN_EU_2_154, убедитесь, что у него качественно другое поведение, хотя он достаточно активен в системе.\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
" Сформируем множество потенциально зараженных узлов. "
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['PN_EU_0_468' 'PN_EU_0_143' 'PN_EU_0_1209' 'PN_EU_0_1166' 'PN_EU_0_248'\n",
" 'PN_EU_1_211' 'PN_EU_0_741' 'PN_EU_0_1256' 'PN_EU_0_761' 'PN_EU_0_704'\n",
" 'PN_EU_0_227' 'PN_EU_0_501' 'PN_EU_0_1032' 'PN_EU_0_870' 'PN_EU_0_687'\n",
" 'PN_EU_0_5' 'PN_EU_0_767' 'PN_EU_0_668' 'PN_EU_1_8' 'PN_EU_1_352'\n",
" 'PN_EU_2_134' 'PN_EU_0_789' 'PN_EU_0_1113' 'PN_EU_1_508' 'PN_EU_2_57'\n",
" 'PN_EU_0_49' 'PN_EU_0_19' 'PN_EU_3_5' 'PN_EU_0_298' 'PN_EU_1_99'\n",
" 'PN_EU_0_87' 'PN_EU_2_51' 'PN_EU_1_495' 'PN_EU_0_379' 'PN_EU_2_128'\n",
" 'PN_EU_1_437' 'PN_EU_0_888' 'PN_EU_0_779' 'PN_EU_0_476' 'PN_EU_1_114'\n",
" 'PN_EU_1_449' 'PN_EU_0_122' 'PN_EU_1_167' 'PN_EU_1_26' 'PN_EU_3_29']\n"
]
}
],
"source": [
"infected_users = (df.loc[((df[\"User ID (receiver)\"].isin(suspected_users)) & (df['Type of transaction']=='Ind'))][\"User ID (sender)\"]).unique()\n",
"print(infected_users)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "XmxEO8CBupe1"
},
"source": [
"# Оценка точности предположения"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "N58btar078vP"
},
"source": [
"Теперь оценим точность нашего предположения. \n",
"**Внимание!** Это можно сделать только, имея размеченные данные. В вашей лабораторной работы таких сведений нет.\n"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {
"id": "MwjpUAmquoSf"
},
"outputs": [],
"source": [
"def check_accuracy(df_suspected, df_groundtruth, columns_to_check=[]):\n",
" is_criminal_mapping = {\n",
" 'N_RegC2C': False, \n",
" 'N_RegDep': False, \n",
" 'N_Reg_RC': False, \n",
" 'N_RegWith': False, \n",
" 'N_Reg_Merch': False, \n",
" 'F_Mule_With': True,\n",
" 'F_bot': True,\n",
" 'F_SevWith': True,\n",
" }\n",
" \n",
" \n",
" df_with_required_columns = df_groundtruth[df_groundtruth['Groundtruth'].isin(columns_to_check)].copy()\n",
" \n",
" df_all = pd.concat([ df_with_required_columns,df_suspected])\n",
" true_count = df_all.duplicated(keep='first').sum() #возможно это не самый лучший вариант\n",
" \n",
" #\n",
" accuracy = true_count/df_with_required_columns.shape[0]\n",
" \n",
" fpr = (df_suspected.shape[0] - true_count)/(df_groundtruth.shape[0]-df_with_required_columns.shape[0])\n",
" \n",
" return accuracy, fpr"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "YfURKz23tIlr",
"outputId": "d9a90a01-53e6-4f6d-c86c-aa1091349f60"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Infected accounts detection: accuracy = 0.7614424410540915, FPR = 0.00025865095054224325\n",
"Mule accounts detection: accuracy = 0.7629009762900977, FPR = 7.3894810736916e-05\n"
]
}
],
"source": [
"#infected accounts\n",
"\n",
"df_infected_transactions = df.loc[((df[\"User ID (receiver)\"].isin(suspected_users)) & (df['Type of transaction']=='Ind'))]\n",
"accuracy, fpr = check_accuracy(df_infected_transactions, df, ['F_bot'])\n",
"print(f\"Infected accounts detection: accuracy = {accuracy}, FPR = {fpr}\" )\n",
"df_infected_withdrawals = df.loc[((df[\"User ID (sender)\"].isin(suspected_users)) & (df['Type of transaction']=='Wl'))]\n",
"accuracy, fpr = check_accuracy(df_infected_withdrawals, df, ['F_Mule_With'])\n",
"print(f\"Mule accounts detection: accuracy = {accuracy}, FPR = {fpr}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Точность обнаружения не высока, это означает, обнаружили не все транзакции, но структуру бота обнаружили верно. Проверить это можно, используя уже groundtruth и визуализировать не обнаруженные транзакции. \n",
"\n",
"Попробуйте это сделать, настроив внешний вид узлов графа соответствующим образом.\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "RPwe323ykLzd"
},
"source": [
"# Отображения графа с помощью библиотеки Pyvis "
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "wB6guMNs_BL7"
},
"source": [
"Предыдущие визуализации графа были неплохи, но они не интерактивны, иногда очень хотелось сдвинуть вершины. Это можно сделать с помощью библиотеки PyVis. Построим с помощью этой библиотеку графы контактов пользователей мулов и подозрительно пользователя \n",
"#suspected_users = ['PN_EU_2_154']\n",
"#suspected_users = ['PN_EU_0_955']"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"id": "nMCCbmImGON4"
},
"outputs": [],
"source": [
"def draw_pyvis_network_for_df(df, title):\n",
" nt = net.Network(\n",
" height='900px', width='100%',\n",
" heading=f\"{title}\",\n",
" directed=True,\n",
" notebook=True,\n",
"\n",
" cdn_resources='remote'\n",
" ) \n",
"\n",
" for _, row in df.iterrows():\n",
" \n",
" \n",
" nt.add_node(\n",
" row[\"User ID (sender)\"], \n",
" label=row[\"User ID (sender)\"], \n",
" shape=TYPE_TO_SHAPE[row[\"Sender type\"]], #имхо избыточно\n",
" color=TYPE_TO_COLOR[row[\"Sender type\"]],\n",
" title=row[\"User ID (sender)\"],\n",
" )\n",
"\n",
" nt.add_node(\n",
" row[\"User ID (receiver)\"], \n",
" label=row[\"User ID (receiver)\"], \n",
" shape=TYPE_TO_SHAPE[row[\"Receiver type\"]],\n",
" color= TYPE_TO_COLOR[row[\"Receiver type\"]],\n",
" #title=f\"{'Criminal' if is_receiver_criminal else None}\",\n",
" )\n",
"# можно поэкспериментировать с цветом ребра в зависимости от размера перевода\n",
" nt.add_edge(\n",
" row[\"User ID (sender)\"], \n",
" row[\"User ID (receiver)\"],\n",
" color='#666666',\n",
" width=1,\n",
" title=f'Amount: {row[\"Amount of transaction\"]}',\n",
" label=f\"{row['Type of transaction']}: {row['Amount of transaction']}\"\n",
" )\n",
"\n",
" return nt"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "IIURLuSHCFDl"
},
"source": [
"Вновь отрисуем граф контактов пользователей, чьи устройства заражены мобильной бот сетью."
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"id": "zHqc4QbpClRt"
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Groundtruth | \n",
" User ID (sender) | \n",
" User ID (receiver) | \n",
" User account ID (sender) | \n",
" User account ID (receiver) | \n",
" Amount of transaction | \n",
" Type of transaction | \n",
" Transaction timestamp | \n",
" Sender type | \n",
" Receiver type | \n",
"
\n",
" \n",
" \n",
" \n",
" 13105 | \n",
" F_Mule_With | \n",
" PN_EU_0_955 | \n",
" PN_Ret1 | \n",
" EUAcc0_955 | \n",
" RAcc1 | \n",
" 1776.92 | \n",
" Wl | \n",
" 2011-01-07 20:34:22 | \n",
" EU | \n",
" RET | \n",
"
\n",
" \n",
" 13130 | \n",
" F_bot | \n",
" PN_EU_0_1209 | \n",
" PN_EU_0_955 | \n",
" EUAcc0_1209 | \n",
" EUAcc0_955 | \n",
" 15852.87 | \n",
" Ind | \n",
" 2011-01-07 22:31:14 | \n",
" EU | \n",
" EU | \n",
"
\n",
" \n",
" 41283 | \n",
" F_Mule_With | \n",
" PN_EU_0_955 | \n",
" PN_Ret6 | \n",
" EUAcc0_955 | \n",
" RAcc6 | \n",
" 12042.49 | \n",
" Wl | \n",
" 2011-01-09 12:23:21 | \n",
" EU | \n",
" RET | \n",
"
\n",
" \n",
" 13532 | \n",
" F_Mule_With | \n",
" PN_EU_0_955 | \n",
" PN_Ret3 | \n",
" EUAcc0_955 | \n",
" RAcc3 | \n",
" 15694.34 | \n",
" Wl | \n",
" 2011-02-07 20:22:03 | \n",
" EU | \n",
" RET | \n",
"
\n",
" \n",
" 13552 | \n",
" F_bot | \n",
" PN_EU_0_761 | \n",
" PN_EU_0_955 | \n",
" EUAcc0_761 | \n",
" EUAcc0_955 | \n",
" 15528.15 | \n",
" Ind | \n",
" 2011-02-07 21:30:22 | \n",
" EU | \n",
" EU | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Groundtruth User ID (sender) User ID (receiver) \\\n",
"13105 F_Mule_With PN_EU_0_955 PN_Ret1 \n",
"13130 F_bot PN_EU_0_1209 PN_EU_0_955 \n",
"41283 F_Mule_With PN_EU_0_955 PN_Ret6 \n",
"13532 F_Mule_With PN_EU_0_955 PN_Ret3 \n",
"13552 F_bot PN_EU_0_761 PN_EU_0_955 \n",
"\n",
" User account ID (sender) User account ID (receiver) \\\n",
"13105 EUAcc0_955 RAcc1 \n",
"13130 EUAcc0_1209 EUAcc0_955 \n",
"41283 EUAcc0_955 RAcc6 \n",
"13532 EUAcc0_955 RAcc3 \n",
"13552 EUAcc0_761 EUAcc0_955 \n",
"\n",
" Amount of transaction Type of transaction Transaction timestamp \\\n",
"13105 1776.92 Wl 2011-01-07 20:34:22 \n",
"13130 15852.87 Ind 2011-01-07 22:31:14 \n",
"41283 12042.49 Wl 2011-01-09 12:23:21 \n",
"13532 15694.34 Wl 2011-02-07 20:22:03 \n",
"13552 15528.15 Ind 2011-02-07 21:30:22 \n",
"\n",
" Sender type Receiver type \n",
"13105 EU RET \n",
"13130 EU EU \n",
"41283 EU RET \n",
"13532 EU RET \n",
"13552 EU EU "
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#suspected_users = ['PN_EU_2_154']\n",
"suspected_users = ['PN_EU_0_955']\n",
"#suspected_users = ['PN_EU_0_955', 'PN_EU_0_260', 'PN_EU_1_328', 'PN_0_1045']\n",
"df_selected = df.loc[((df[\"User ID (sender)\"].isin(suspected_users)) | (df[\"User ID (receiver)\"].isin(suspected_users)))]\n",
"df_selected.head()"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
},
"id": "ZT7Ai0TZD4fJ",
"outputId": "cf2a30c6-ed92-46da-b823-8b17183cec7f"
},
"outputs": [],
"source": [
"n = draw_pyvis_network_for_df(df_selected, \"Mobile botnet infection\")\n"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"PN_EU_0_955.html\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"n.toggle_physics(False)\n",
"#можно настраивать силы притяженяи и отталкивания\n",
"n.show_buttons(filter_=['physics'])\n",
"n.show(\"PN_EU_0_955.html\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "_6gtEXwkHALL"
},
"source": [
"# Кража телефона"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "VvPo27lYu-z7"
},
"source": [
"## Обнаружение подозрительных транзакций"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Ниже представлено альтернативное решение поиска случае кражи телефона, основанные на анализе финансовых транзакций: фильтруются все транзакции за один день и формируются множество транзакций, сумма переводов которых меньше средней суммы за день."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "tdk-wAJzIL4c"
},
"source": [
"## Подготовка данных "
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "k2ljMC6YhXXL"
},
"source": [
"Признаки кражи:\n",
"\n",
"- Сумма мошеннических операций меньше, чем средняя сумма пользователей\n",
"- Мошенник пытается снять деньги несколько раз в течение короткого промежутка времени"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"id": "DslAQU_Tu0js"
},
"outputs": [],
"source": [
"df = df.sort_values(by=['Transaction timestamp'], ascending=True)\n",
"dfs_by_day = [g for _, g in df.groupby(df['Transaction timestamp'].dt.date)]\n",
"df_day = dfs_by_day[17] "
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {
"id": "QKorGEkcMT8G"
},
"outputs": [],
"source": [
"def get_theft_df(df):\n",
" mean_amount = df[\"Amount of transaction\"].mean()\n",
"\n",
" median_df = (\n",
" df[['User ID (sender)', 'Type of transaction', 'Sender type']]\n",
" .groupby(['User ID (sender)', 'Type of transaction', 'Sender type'])\n",
" .size()\n",
" .groupby(['Type of transaction', 'Sender type'])\n",
" .median()\n",
" .to_frame(name='Median number of transactions')\n",
" .reset_index()\n",
" .sort_values(by=['Sender type'], ascending=True))\n",
"\n",
" median_wl = median_df.loc[median_df['Type of transaction'] == 'Wl', 'Median number of transactions'].iloc[0]\n",
" # Filter withdrawals with amount less than mean_amount\n",
" theft_df = df[(df['Type of transaction'] == 'Wl') & (df[\"Amount of transaction\"] < mean_amount)].copy()\n",
"\n",
" # Select users, who made more than median_number_of_transactions_by_type withdrawals\n",
" suspected_users = theft_df.groupby(['User ID (sender)'])['User ID (sender)'].transform('count') > median_wl\n",
" \n",
" theft_df.insert(0, 'Suspected', False)\n",
" theft_df.loc[:, 'Suspected'] = suspected_users\n",
"\n",
" suspected_senders_set = set(theft_df.loc[theft_df['Suspected'] == True, 'User ID (sender)'])\n",
" suspected_receivers_set = set(theft_df.loc[theft_df['Suspected'] == True, 'User ID (receiver)'])\n",
"\n",
" criminals = (suspected_senders_set | suspected_receivers_set)\n",
"\n",
" return theft_df, criminals"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 354
},
"id": "8Un-A6RCEfVx",
"outputId": "4f375b2d-6eb6-4b95-87c8-f20b7cb933bb"
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Suspected | \n",
" Groundtruth | \n",
" User ID (sender) | \n",
" User ID (receiver) | \n",
" User account ID (sender) | \n",
" User account ID (receiver) | \n",
" Amount of transaction | \n",
" Type of transaction | \n",
" Transaction timestamp | \n",
" Sender type | \n",
" Receiver type | \n",
"
\n",
" \n",
" \n",
" \n",
" 1018 | \n",
" False | \n",
" N_RegWith | \n",
" PN_EU_1_72 | \n",
" PN_Ret6 | \n",
" EUAcc1_72 | \n",
" RAcc6 | \n",
" 21279.29 | \n",
" Wl | \n",
" 2011-05-06 04:15:47 | \n",
" EU | \n",
" RET | \n",
"
\n",
" \n",
" 1075 | \n",
" False | \n",
" N_RegWith | \n",
" PN_EU_1_105 | \n",
" PN_Ret2 | \n",
" EUAcc1_105 | \n",
" RAcc2 | \n",
" 16831.24 | \n",
" Wl | \n",
" 2011-05-06 11:04:29 | \n",
" EU | \n",
" RET | \n",
"
\n",
" \n",
" 1083 | \n",
" False | \n",
" N_RegWith | \n",
" PN_EU_3_4 | \n",
" PN_Ret6 | \n",
" EUAcc3_4 | \n",
" RAcc6 | \n",
" 36518.17 | \n",
" Wl | \n",
" 2011-05-06 11:45:31 | \n",
" EU | \n",
" RET | \n",
"
\n",
" \n",
" 1140 | \n",
" False | \n",
" F_Mule_With | \n",
" PN_EU_1_328 | \n",
" PN_Ret4 | \n",
" EUAcc1_328 | \n",
" RAcc4 | \n",
" 23505.74 | \n",
" Wl | \n",
" 2011-05-06 16:12:54 | \n",
" EU | \n",
" RET | \n",
"
\n",
" \n",
" 1168 | \n",
" False | \n",
" F_Mule_With | \n",
" PN_EU_0_1045 | \n",
" PN_Ret5 | \n",
" EUAcc0_1045 | \n",
" RAcc5 | \n",
" 4703.50 | \n",
" Wl | \n",
" 2011-05-06 19:11:16 | \n",
" EU | \n",
" RET | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Suspected Groundtruth User ID (sender) User ID (receiver) \\\n",
"1018 False N_RegWith PN_EU_1_72 PN_Ret6 \n",
"1075 False N_RegWith PN_EU_1_105 PN_Ret2 \n",
"1083 False N_RegWith PN_EU_3_4 PN_Ret6 \n",
"1140 False F_Mule_With PN_EU_1_328 PN_Ret4 \n",
"1168 False F_Mule_With PN_EU_0_1045 PN_Ret5 \n",
"\n",
" User account ID (sender) User account ID (receiver) \\\n",
"1018 EUAcc1_72 RAcc6 \n",
"1075 EUAcc1_105 RAcc2 \n",
"1083 EUAcc3_4 RAcc6 \n",
"1140 EUAcc1_328 RAcc4 \n",
"1168 EUAcc0_1045 RAcc5 \n",
"\n",
" Amount of transaction Type of transaction Transaction timestamp \\\n",
"1018 21279.29 Wl 2011-05-06 04:15:47 \n",
"1075 16831.24 Wl 2011-05-06 11:04:29 \n",
"1083 36518.17 Wl 2011-05-06 11:45:31 \n",
"1140 23505.74 Wl 2011-05-06 16:12:54 \n",
"1168 4703.50 Wl 2011-05-06 19:11:16 \n",
"\n",
" Sender type Receiver type \n",
"1018 EU RET \n",
"1075 EU RET \n",
"1083 EU RET \n",
"1140 EU RET \n",
"1168 EU RET "
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"theft_df, criminals = get_theft_df(df_day)\n",
"theft_df.head()\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "td5oXAnyvFmA"
},
"source": [
"## Graph"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"G = create_network_for_df(theft_df)"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 917
},
"id": "SRSWLmhEIc1v",
"outputId": "38c79c31-3036-45d4-cb4a-8c35aa061b3f"
},
"outputs": [
{
"data": {
"text/html": [
"\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"draw_network(G, \"Структура мошенничества\",\"Кража мобильного телефона\" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"А можно проанализровать структурные связи пользователей, которые формируют небольшую группу, которые обнаружили при построении проекции данных для атрибутов: MobileTheft_labels = ['Sent_amount_Wl',\t'Sent_amount_Wl_median',\t'Sent_amount_Wl_min',\t'Sent_amount_Wl_max',\t'Sent_Wl_count']. Выберем, пользователя PN_EU_0_77\n"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Groundtruth | \n",
" User ID (sender) | \n",
" User ID (receiver) | \n",
" User account ID (sender) | \n",
" User account ID (receiver) | \n",
" Amount of transaction | \n",
" Type of transaction | \n",
" Transaction timestamp | \n",
" Sender type | \n",
" Receiver type | \n",
"
\n",
" \n",
" \n",
" \n",
" 13714 | \n",
" N_RegDep | \n",
" PN_Ret5 | \n",
" PN_EU_0_77 | \n",
" RAcc5 | \n",
" EUAcc0_77 | \n",
" 28793.31 | \n",
" Dt | \n",
" 2011-03-07 05:32:34 | \n",
" RET | \n",
" EU | \n",
"
\n",
" \n",
" 42151 | \n",
" N_RegC2C | \n",
" PN_EU_2_0 | \n",
" PN_EU_0_77 | \n",
" EUAcc2_0 | \n",
" EUAcc0_77 | \n",
" 26474.59 | \n",
" Ind | \n",
" 2011-03-09 10:56:12 | \n",
" EU | \n",
" EU | \n",
"
\n",
" \n",
" 43105 | \n",
" N_RegDep | \n",
" PN_Ret5 | \n",
" PN_EU_0_77 | \n",
" RAcc5 | \n",
" EUAcc0_77 | \n",
" 70363.71 | \n",
" Dt | \n",
" 2011-05-09 13:45:51 | \n",
" RET | \n",
" EU | \n",
"
\n",
" \n",
" 23305 | \n",
" N_RegDep | \n",
" PN_Ret6 | \n",
" PN_EU_0_77 | \n",
" RAcc6 | \n",
" EUAcc0_77 | \n",
" 103681.73 | \n",
" Dt | \n",
" 2011-07-24 03:40:14 | \n",
" RET | \n",
" EU | \n",
"
\n",
" \n",
" 25522 | \n",
" N_RegDep | \n",
" PN_Ret5 | \n",
" PN_EU_0_77 | \n",
" RAcc5 | \n",
" EUAcc0_77 | \n",
" 142838.85 | \n",
" Dt | \n",
" 2011-07-28 21:30:11 | \n",
" RET | \n",
" EU | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Groundtruth User ID (sender) User ID (receiver) \\\n",
"13714 N_RegDep PN_Ret5 PN_EU_0_77 \n",
"42151 N_RegC2C PN_EU_2_0 PN_EU_0_77 \n",
"43105 N_RegDep PN_Ret5 PN_EU_0_77 \n",
"23305 N_RegDep PN_Ret6 PN_EU_0_77 \n",
"25522 N_RegDep PN_Ret5 PN_EU_0_77 \n",
"\n",
" User account ID (sender) User account ID (receiver) \\\n",
"13714 RAcc5 EUAcc0_77 \n",
"42151 EUAcc2_0 EUAcc0_77 \n",
"43105 RAcc5 EUAcc0_77 \n",
"23305 RAcc6 EUAcc0_77 \n",
"25522 RAcc5 EUAcc0_77 \n",
"\n",
" Amount of transaction Type of transaction Transaction timestamp \\\n",
"13714 28793.31 Dt 2011-03-07 05:32:34 \n",
"42151 26474.59 Ind 2011-03-09 10:56:12 \n",
"43105 70363.71 Dt 2011-05-09 13:45:51 \n",
"23305 103681.73 Dt 2011-07-24 03:40:14 \n",
"25522 142838.85 Dt 2011-07-28 21:30:11 \n",
"\n",
" Sender type Receiver type \n",
"13714 RET EU \n",
"42151 EU EU \n",
"43105 RET EU \n",
"23305 RET EU \n",
"25522 RET EU "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"suspected_theft = ['PN_EU_0_77']\n",
"df_selected = df.loc[((df[\"User ID (sender)\"].isin(suspected_theft)) | (df[\"User ID (receiver)\"].isin(suspected_theft)))]\n",
"df_selected.head()"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"n = draw_pyvis_network_for_df(df_selected, \"Mobile phone theft \")\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"'PN_EU_0_77'.html\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"n.toggle_physics(False)\n",
"#можно настраивать силы притяженяи и отталкивания\n",
"n.show_buttons(filter_=['physics'])\n",
"n.show(\"'PN_EU_0_77'.html\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Что мы увидели на этом графе: пользователь PN_EU_0_77 обращается к 2 агентам для пополнения электронного кошелька, а к 3 агентам для снятия электронных денег (при чем к 2 агентам Ret1 и Ret 2 только для снятия). По сути мы не можем на основании этих данных делать какие либо выводы. Но предлагаю просмотреть логи транзакций между пользователем и этими агентами, оценить частоту (а заодно посмотреть на groundtruth). "
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Groundtruth | \n",
" User ID (sender) | \n",
" User ID (receiver) | \n",
" User account ID (sender) | \n",
" User account ID (receiver) | \n",
" Amount of transaction | \n",
" Type of transaction | \n",
" Transaction timestamp | \n",
" Sender type | \n",
" Receiver type | \n",
"
\n",
" \n",
" \n",
" \n",
" 2634 | \n",
" F_SevWith | \n",
" PN_EU_0_77 | \n",
" PN_Ret2 | \n",
" EUAcc0_77 | \n",
" RAcc2 | \n",
" 5229.26 | \n",
" Wl | \n",
" 2011-09-06 16:56:51 | \n",
" EU | \n",
" RET | \n",
"
\n",
" \n",
" 2636 | \n",
" F_SevWith | \n",
" PN_EU_0_77 | \n",
" PN_Ret4 | \n",
" EUAcc0_77 | \n",
" RAcc4 | \n",
" 382.89 | \n",
" Wl | \n",
" 2011-09-06 17:00:20 | \n",
" EU | \n",
" RET | \n",
"
\n",
" \n",
" 2638 | \n",
" F_SevWith | \n",
" PN_EU_0_77 | \n",
" PN_Ret1 | \n",
" EUAcc0_77 | \n",
" RAcc1 | \n",
" 2682.01 | \n",
" Wl | \n",
" 2011-09-06 17:03:12 | \n",
" EU | \n",
" RET | \n",
"
\n",
" \n",
" 2640 | \n",
" F_SevWith | \n",
" PN_EU_0_77 | \n",
" PN_Ret4 | \n",
" EUAcc0_77 | \n",
" RAcc4 | \n",
" 1058.97 | \n",
" Wl | \n",
" 2011-09-06 17:06:10 | \n",
" EU | \n",
" RET | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Groundtruth User ID (sender) User ID (receiver) User account ID (sender) \\\n",
"2634 F_SevWith PN_EU_0_77 PN_Ret2 EUAcc0_77 \n",
"2636 F_SevWith PN_EU_0_77 PN_Ret4 EUAcc0_77 \n",
"2638 F_SevWith PN_EU_0_77 PN_Ret1 EUAcc0_77 \n",
"2640 F_SevWith PN_EU_0_77 PN_Ret4 EUAcc0_77 \n",
"\n",
" User account ID (receiver) Amount of transaction Type of transaction \\\n",
"2634 RAcc2 5229.26 Wl \n",
"2636 RAcc4 382.89 Wl \n",
"2638 RAcc1 2682.01 Wl \n",
"2640 RAcc4 1058.97 Wl \n",
"\n",
" Transaction timestamp Sender type Receiver type \n",
"2634 2011-09-06 16:56:51 EU RET \n",
"2636 2011-09-06 17:00:20 EU RET \n",
"2638 2011-09-06 17:03:12 EU RET \n",
"2640 2011-09-06 17:06:10 EU RET "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_suspected_transactions = df.loc[((df[\"User ID (sender)\"] == \"PN_EU_0_77\") & df['User ID (receiver)'].isin([\"PN_Ret1\",\"PN_Ret2\", \"PN_Ret4\"]))]\n",
"df_suspected_transactions.head()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Обратите внимание на временные метки этих транзакций: все 4 транзакции осуществляются в течение 10 минут! Следовательно, именно частота операций снятия должна стать основным признаком для выявления данной аномалии. "
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "aP1G0wY88PMU"
},
"source": [
"## Поиск всех краж телефона"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Проанализировав графы, построенные для ряда пользователей из группы (PN_EU_0_720, PN_EU_0_77, PN_EU_0_472), можно увидеть, что для этих пользователей характерно большое число связей с агентами (зеленые ромбы), а также небольшие суммы транзакций не больше 10000. Давайте предположим, что именно они являются жертвами кражи. (частоту транзакций не учитывам тут.)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Groundtruth | \n",
" User ID (sender) | \n",
" User ID (receiver) | \n",
" User account ID (sender) | \n",
" User account ID (receiver) | \n",
" Amount of transaction | \n",
" Type of transaction | \n",
" Transaction timestamp | \n",
" Sender type | \n",
" Receiver type | \n",
"
\n",
" \n",
" \n",
" \n",
" 147 | \n",
" N_RegWith | \n",
" PN_EU_3_8 | \n",
" PN_Ret6 | \n",
" EUAcc3_8 | \n",
" RAcc6 | \n",
" 5601.24 | \n",
" Wl | \n",
" 2011-01-06 18:56:20 | \n",
" EU | \n",
" RET | \n",
"
\n",
" \n",
" 12759 | \n",
" N_RegWith | \n",
" PN_EU_2_41 | \n",
" PN_Ret5 | \n",
" EUAcc2_41 | \n",
" RAcc5 | \n",
" 5601.31 | \n",
" Wl | \n",
" 2011-01-07 02:36:41 | \n",
" EU | \n",
" RET | \n",
"
\n",
" \n",
" 12905 | \n",
" F_Mule_With | \n",
" PN_EU_1_328 | \n",
" PN_Ret2 | \n",
" EUAcc1_328 | \n",
" RAcc2 | \n",
" 6059.96 | \n",
" Wl | \n",
" 2011-01-07 10:31:04 | \n",
" EU | \n",
" RET | \n",
"
\n",
" \n",
" 12930 | \n",
" F_Mule_With | \n",
" PN_EU_0_1045 | \n",
" PN_Ret3 | \n",
" EUAcc0_1045 | \n",
" RAcc3 | \n",
" 2175.82 | \n",
" Wl | \n",
" 2011-01-07 11:39:25 | \n",
" EU | \n",
" RET | \n",
"
\n",
" \n",
" 13105 | \n",
" F_Mule_With | \n",
" PN_EU_0_955 | \n",
" PN_Ret1 | \n",
" EUAcc0_955 | \n",
" RAcc1 | \n",
" 1776.92 | \n",
" Wl | \n",
" 2011-01-07 20:34:22 | \n",
" EU | \n",
" RET | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Groundtruth User ID (sender) User ID (receiver) \\\n",
"147 N_RegWith PN_EU_3_8 PN_Ret6 \n",
"12759 N_RegWith PN_EU_2_41 PN_Ret5 \n",
"12905 F_Mule_With PN_EU_1_328 PN_Ret2 \n",
"12930 F_Mule_With PN_EU_0_1045 PN_Ret3 \n",
"13105 F_Mule_With PN_EU_0_955 PN_Ret1 \n",
"\n",
" User account ID (sender) User account ID (receiver) \\\n",
"147 EUAcc3_8 RAcc6 \n",
"12759 EUAcc2_41 RAcc5 \n",
"12905 EUAcc1_328 RAcc2 \n",
"12930 EUAcc0_1045 RAcc3 \n",
"13105 EUAcc0_955 RAcc1 \n",
"\n",
" Amount of transaction Type of transaction Transaction timestamp \\\n",
"147 5601.24 Wl 2011-01-06 18:56:20 \n",
"12759 5601.31 Wl 2011-01-07 02:36:41 \n",
"12905 6059.96 Wl 2011-01-07 10:31:04 \n",
"12930 2175.82 Wl 2011-01-07 11:39:25 \n",
"13105 1776.92 Wl 2011-01-07 20:34:22 \n",
"\n",
" Sender type Receiver type \n",
"147 EU RET \n",
"12759 EU RET \n",
"12905 EU RET \n",
"12930 EU RET \n",
"13105 EU RET "
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# \n",
"df_suspected_thefts = df.loc[((df[\"Amount of transaction\"] < 10000) & (df['Type of transaction']=='Wl'))]\n",
"df_suspected_thefts.head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ydGI-iVp8fx_",
"outputId": "1a11dc0f-71eb-4818-96a2-8362522de973"
},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'check_accuracy' is not defined",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[12], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m accuracy, fpr \u001b[38;5;241m=\u001b[39m \u001b[43mcheck_accuracy\u001b[49m(df_suspected_thefts, df, [\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mF_SevWith\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mInfected accounts detection: accuracy = \u001b[39m\u001b[38;5;132;01m{\u001b[39;00maccuracy\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, FPR = \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfpr\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m )\n",
"\u001b[1;31mNameError\u001b[0m: name 'check_accuracy' is not defined"
]
}
],
"source": [
"\n",
"accuracy, fpr = check_accuracy(df_suspected_thefts, df, ['F_SevWith'])\n",
"print(f\"Infected accounts detection: accuracy = {accuracy}, FPR = {fpr}\" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Полученная точность очень низкая, потому что не был учтен такой параметр как частота снятий, следовательно, необходимо отслеживать данный параметр, не учтено число уникальных агентов (Ret). Предлагаю, продумать, как учесть данный параметр. Доработайте! \n",
"\n",
"Тем не менее посмотрим график транзакций, которые потенциально могут быть опасными."
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 542
},
"id": "MLpk9VDk-0V5",
"outputId": "8f5f9856-b472-4b34-c7a2-4cf2f6730d81"
},
"outputs": [
{
"data": {
"text/html": [
"\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"G = create_network_for_df(df_suspected_thefts)\n",
"draw_network(G, \"Структура мошенничества\",\"Кража мобильного телефона\" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Внимательно рассмотрев этот рисунок, можно исключить тех пользователей, которые снимают деньги только у одного агента, подозрительными транзакциями будут у тех пользвоателей, которые связаны операцией снятий сразу с несколькими агентами. Кстати на этом графике хорошо видно 4 пользователя-мула из предыдущего мошенничества. Кражи не столь очевидны. Теперь построим граф в Pyvis и поэкспериментируем с настройками компоновки узлов графа на основе сил. Попробуйте задать следующим параметры алгоритма: \n",
"centralGravity: 0.2\n",
"springLength: 295\n",
"springConstant: 0\n",
"nodeDistance: 100\n",
"damping: 0.09\n",
"maxVelocity: 60\n",
"minVelocity: 0.07\n",
"solver:repulsion\n",
"И вы увидете как расположились узлы: по внешнему радиусы узлы, которые связаны только с одним агентом, внутри круга - мулы из сценария заражением ботнетом, а вот небольшие группы узлов рядом с агентами как раз и есть пользователи, у которых были украдены телефоны."
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"n = draw_pyvis_network_for_df(df_suspected_thefts, \"Mobile phone theft \")\n"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mobile phone theft.html\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"n.toggle_physics(False)\n",
"#можно настраивать силы притяженяи и отталкивания\n",
"n.show_buttons(filter_=['physics'])\n",
"n.show(\"Mobile phone theft.html\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"ДАвайте сформируем список пользователей, чьи телефоны могут быть украдены."
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Число пользователей, у которых были украдены телефоны: 137. В их число входят: ['PN_EU_3_8' 'PN_EU_2_41' 'PN_EU_1_328' 'PN_EU_0_1045' 'PN_EU_0_955'\n",
" 'PN_EU_0_260' 'PN_EU_1_169' 'PN_EU_1_74' 'PN_EU_1_86' 'PN_EU_1_123'\n",
" 'PN_EU_1_35' 'PN_EU_2_59' 'PN_EU_2_39' 'PN_EU_2_108' 'PN_EU_1_118'\n",
" 'PN_EU_2_88' 'PN_EU_2_38' 'PN_EU_2_71' 'PN_EU_2_116' 'PN_EU_2_136'\n",
" 'PN_EU_1_41' 'PN_EU_1_183' 'PN_EU_0_25' 'PN_EU_0_1226' 'PN_EU_0_176'\n",
" 'PN_EU_0_652' 'PN_EU_0_373' 'PN_EU_0_624' 'PN_EU_1_124' 'PN_EU_0_935'\n",
" 'PN_EU_0_171' 'PN_EU_3_15' 'PN_EU_2_155' 'PN_EU_0_763' 'PN_EU_0_590'\n",
" 'PN_EU_3_36' 'PN_EU_1_366' 'PN_EU_1_49' 'PN_EU_0_441' 'PN_EU_2_137'\n",
" 'PN_EU_3_0' 'PN_EU_0_1093' 'PN_EU_0_720' 'PN_EU_0_1173' 'PN_EU_1_10'\n",
" 'PN_EU_2_63' 'PN_EU_0_57' 'PN_EU_2_115' 'PN_EU_0_137' 'PN_EU_3_10'\n",
" 'PN_EU_1_462' 'PN_EU_0_898' 'PN_EU_0_105' 'PN_EU_0_672' 'PN_EU_0_71'\n",
" 'PN_EU_0_173' 'PN_EU_0_1156' 'PN_EU_0_1235' 'PN_EU_2_111' 'PN_EU_1_197'\n",
" 'PN_EU_1_435' 'PN_EU_0_747' 'PN_EU_1_363' 'PN_EU_0_931' 'PN_EU_0_1262'\n",
" 'PN_EU_0_1044' 'PN_EU_0_658' 'PN_EU_1_182' 'PN_EU_3_32' 'PN_EU_2_51'\n",
" 'PN_EU_2_40' 'PN_EU_1_42' 'PN_EU_1_144' 'PN_EU_2_101' 'PN_EU_1_78'\n",
" 'PN_EU_2_107' 'PN_EU_3_14' 'PN_EU_2_118' 'PN_EU_2_75' 'PN_EU_0_76'\n",
" 'PN_EU_1_141' 'PN_EU_1_160' 'PN_EU_1_24' 'PN_EU_2_27' 'PN_EU_1_195'\n",
" 'PN_EU_3_6' 'PN_EU_1_209' 'PN_EU_2_26' 'PN_EU_1_173' 'PN_EU_1_71'\n",
" 'PN_EU_0_101' 'PN_EU_2_157' 'PN_EU_1_62' 'PN_EU_2_49' 'PN_EU_2_21'\n",
" 'PN_EU_2_152' 'PN_EU_1_146' 'PN_EU_0_26' 'PN_EU_3_4' 'PN_EU_0_50'\n",
" 'PN_EU_3_17' 'PN_EU_1_147' 'PN_EU_2_44' 'PN_EU_1_208' 'PN_EU_1_158'\n",
" 'PN_EU_0_1183' 'PN_EU_0_77' 'PN_EU_2_161' 'PN_EU_0_472' 'PN_EU_1_391'\n",
" 'PN_EU_1_149' 'PN_EU_0_876' 'PN_EU_1_377' 'PN_EU_0_1114' 'PN_EU_0_16'\n",
" 'PN_EU_3_26' 'PN_EU_0_751' 'PN_EU_0_1135' 'PN_EU_0_1128' 'PN_EU_0_90'\n",
" 'PN_EU_1_156' 'PN_EU_1_179' 'PN_EU_3_22' 'PN_EU_1_19' 'PN_EU_1_106'\n",
" 'PN_EU_1_57' 'PN_EU_2_52' 'PN_EU_0_128' 'PN_EU_2_45' 'PN_EU_3_1'\n",
" 'PN_EU_0_58' 'PN_EU_1_108' 'PN_EU_1_4' 'PN_EU_1_115' 'PN_EU_0_131'\n",
" 'PN_EU_2_65' 'PN_EU_2_35']\n"
]
}
],
"source": [
"victims = df_suspected_thefts[\"User ID (sender)\"].unique()\n",
"print(f'Число пользователей, у которых были украдены телефоны: {len(victims)}. В их число входят: {victims}')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Соглаcно нашим предположениям, у нас 137 жертв кражи телефона, но на самом деле их 61! Улучшите этот показатель, и приступайте к анализу вашего варианта."
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
}
},
"nbformat": 4,
"nbformat_minor": 1
}