Unfinished

This commit is contained in:
Dmitriy Shishkov 2023-11-11 22:55:21 +03:00
parent bf10a1c9f4
commit c4fe35f29b
3 changed files with 54813 additions and 250 deletions

54031
practice4/FinFraud_unknown.csv Normal file

File diff suppressed because it is too large Load Diff

755
practice4/main.ipynb Normal file
View File

@ -0,0 +1,755 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"**Практическая работа №4**\n",
"\n",
"\n",
"# Обнаружение злоумышленников в системе мобильных денежных переводов\n",
"\n",
"_Вариант 5_\n",
"\n",
"\n",
"1) настройка окружения"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33mWARNING: Retrying (Retry(total=4, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x7f47347ab190>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/scipy/\u001b[0m\u001b[33m\n",
"\u001b[0m\u001b[33mWARNING: Retrying (Retry(total=3, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x7f47345c0610>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/scipy/\u001b[0m\u001b[33m\n",
"\u001b[0m\u001b[33mWARNING: Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x7f47345c0b50>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/scipy/\u001b[0m\u001b[33m\n",
"\u001b[0m\u001b[33mWARNING: Retrying (Retry(total=1, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x7f47345c14d0>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/scipy/\u001b[0m\u001b[33m\n",
"\u001b[0m\u001b[33mWARNING: Retrying (Retry(total=0, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x7f47345c1ed0>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/scipy/\u001b[0m\u001b[33m\n",
"\u001b[0m\u001b[31mERROR: Could not find a version that satisfies the requirement scipy==1.8.1 (from versions: none)\u001b[0m\u001b[31m\n",
"\u001b[0m\u001b[31mERROR: No matching distribution found for scipy==1.8.1\u001b[0m\u001b[31m\n",
"\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: networkx==2.7.0 in ./.venv/lib64/python3.11/site-packages (2.7)\n",
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"Requirement already satisfied: pyvis in ./.venv/lib64/python3.11/site-packages (0.3.2)\n",
"Requirement already satisfied: pandas in ./.venv/lib64/python3.11/site-packages (2.0.1)\n",
"Requirement already satisfied: numpy in ./.venv/lib64/python3.11/site-packages (1.24.3)\n",
"Requirement already satisfied: plotly in ./.venv/lib64/python3.11/site-packages (5.14.1)\n",
"Requirement already satisfied: ipython>=5.3.0 in ./.venv/lib64/python3.11/site-packages (from pyvis) (8.13.2)\n",
"Requirement already satisfied: jinja2>=2.9.6 in ./.venv/lib64/python3.11/site-packages (from pyvis) (3.1.2)\n",
"Requirement already satisfied: jsonpickle>=1.4.1 in ./.venv/lib64/python3.11/site-packages (from pyvis) (3.0.1)\n",
"Requirement already satisfied: networkx>=1.11 in ./.venv/lib64/python3.11/site-packages (from pyvis) (2.7)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in ./.venv/lib64/python3.11/site-packages (from pandas) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in ./.venv/lib64/python3.11/site-packages (from pandas) (2023.3)\n",
"Requirement already satisfied: tzdata>=2022.1 in ./.venv/lib64/python3.11/site-packages (from pandas) (2023.3)\n",
"Requirement already satisfied: tenacity>=6.2.0 in ./.venv/lib64/python3.11/site-packages (from plotly) (8.2.2)\n",
"Requirement already satisfied: packaging in ./.venv/lib64/python3.11/site-packages (from plotly) (23.1)\n",
"Requirement already satisfied: backcall in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (0.2.0)\n",
"Requirement already satisfied: decorator in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (5.1.1)\n",
"Requirement already satisfied: jedi>=0.16 in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (0.18.2)\n",
"Requirement already satisfied: matplotlib-inline in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (0.1.6)\n",
"Requirement already satisfied: pickleshare in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (0.7.5)\n",
"Requirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (3.0.38)\n",
"Requirement already satisfied: pygments>=2.4.0 in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (2.15.1)\n",
"Requirement already satisfied: stack-data in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (0.6.2)\n",
"Requirement already satisfied: traitlets>=5 in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (5.9.0)\n",
"Requirement already satisfied: pexpect>4.3 in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (4.8.0)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in ./.venv/lib64/python3.11/site-packages (from jinja2>=2.9.6->pyvis) (2.1.2)\n",
"Requirement already satisfied: six>=1.5 in ./.venv/lib64/python3.11/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n",
"Requirement already satisfied: parso<0.9.0,>=0.8.0 in ./.venv/lib64/python3.11/site-packages (from jedi>=0.16->ipython>=5.3.0->pyvis) (0.8.3)\n",
"Requirement already satisfied: ptyprocess>=0.5 in ./.venv/lib64/python3.11/site-packages (from pexpect>4.3->ipython>=5.3.0->pyvis) (0.7.0)\n",
"Requirement already satisfied: wcwidth in ./.venv/lib64/python3.11/site-packages (from prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30->ipython>=5.3.0->pyvis) (0.2.6)\n",
"Requirement already satisfied: executing>=1.2.0 in ./.venv/lib64/python3.11/site-packages (from stack-data->ipython>=5.3.0->pyvis) (1.2.0)\n",
"Requirement already satisfied: asttokens>=2.1.0 in ./.venv/lib64/python3.11/site-packages (from stack-data->ipython>=5.3.0->pyvis) (2.2.1)\n",
"Requirement already satisfied: pure-eval in ./.venv/lib64/python3.11/site-packages (from stack-data->ipython>=5.3.0->pyvis) (0.2.2)\n",
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"# %pip install scipy==1.8.1\n",
"# %pip install networkx==2.7.0\n",
"# %pip install pyvis pandas numpy plotly\n",
"\n",
"from functools import reduce\n",
"import tempfile\n",
"\n",
"import networkx as nx\n",
"import numpy as np\n",
"import pandas as pd\n",
"import plotly.express as px\n",
"import plotly.graph_objects as go\n",
"\n",
"from IPython.display import display, HTML\n",
"from plotly.offline import iplot\n",
"from pyvis import network as net\n",
"\n",
"\n",
"\n",
"\n",
"#for Jupyter notebooks\n",
"import plotly.io as pio #comment for Google collab\n",
"pio.renderers.default='notebook'#comment for Google collab"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def pyvis_deepnote_show(nt):\n",
"    \"\"\"Display a pyvis network inline in the notebook.\n",
"\n",
"    The network is saved as HTML inside a temporary directory, read back and\n",
"    passed to ``display(HTML(...))``. The temporary directory (and the file in\n",
"    it) is removed as soon as the HTML has been read.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    nt : object\n",
"        Network object exposing ``save_graph(path)``\n",
"        (presumably a ``pyvis.network.Network`` - confirm against callers).\n",
"    \"\"\"\n",
"    with tempfile.TemporaryDirectory() as tmp_dir:\n",
"        # The name must end in '.html' as in the original suffix='.html'.\n",
"        tmp_output_filename = f\"{tmp_dir}/graph.html\"\n",
"        nt.save_graph(tmp_output_filename)\n",
"\n",
"        # Read inside the 'with' so the handle is closed before cleanup.\n",
"        with open(tmp_output_filename, \"r\") as f:\n",
"            html = f.read()\n",
"\n",
"    display(HTML(html))"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>User ID (sender)</th>\n",
" <th>User ID (receiver)</th>\n",
" <th>User account ID (sender)</th>\n",
" <th>User account ID (receiver)</th>\n",
" <th>Amount of transaction</th>\n",
" <th>Type of transaction</th>\n",
" <th>Transaction timestamp</th>\n",
" <th>Sender type</th>\n",
" <th>Receiver type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>54030</td>\n",
" <td>54030</td>\n",
" <td>54030</td>\n",
" <td>54030</td>\n",
" <td>54030.0</td>\n",
" <td>54030</td>\n",
" <td>54030</td>\n",
" <td>54030</td>\n",
" <td>54030</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>1861</td>\n",
" <td>1562</td>\n",
" <td>1861</td>\n",
" <td>1562</td>\n",
" <td></td>\n",
" <td>5</td>\n",
" <td>46394</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>PN_Ret4</td>\n",
" <td>operator</td>\n",
" <td>RAcc4</td>\n",
" <td>A0</td>\n",
" <td></td>\n",
" <td>ArRC</td>\n",
" <td>08.07.2011 15:16</td>\n",
" <td>EU</td>\n",
" <td>operator</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>2256</td>\n",
" <td>27901</td>\n",
" <td>2256</td>\n",
" <td>27901</td>\n",
" <td></td>\n",
" <td>27901</td>\n",
" <td>5</td>\n",
" <td>41246</td>\n",
" <td>27901</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>53083.47221</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>85834.97052</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>0.0</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>2158.2525</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>6257.375</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>76821.9675</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1053512.86</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" User ID (sender) User ID (receiver) User account ID (sender) \n",
"count 54030 54030 54030 \\\n",
"unique 1861 1562 1861 \n",
"top PN_Ret4 operator RAcc4 \n",
"freq 2256 27901 2256 \n",
"mean \n",
"std \n",
"min \n",
"25% \n",
"50% \n",
"75% \n",
"max \n",
"\n",
" User account ID (receiver) Amount of transaction Type of transaction \n",
"count 54030 54030.0 54030 \\\n",
"unique 1562 5 \n",
"top A0 ArRC \n",
"freq 27901 27901 \n",
"mean 53083.47221 \n",
"std 85834.97052 \n",
"min 0.0 \n",
"25% 2158.2525 \n",
"50% 6257.375 \n",
"75% 76821.9675 \n",
"max 1053512.86 \n",
"\n",
" Transaction timestamp Sender type Receiver type \n",
"count 54030 54030 54030 \n",
"unique 46394 2 4 \n",
"top 08.07.2011 15:16 EU operator \n",
"freq 5 41246 27901 \n",
"mean \n",
"std \n",
"min \n",
"25% \n",
"50% \n",
"75% \n",
"max "
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv('./FinFraud_unknown.csv', sep=',')\n",
"\n",
"# Positional names for the raw CSV columns; 'Not used' marks columns that are discarded.\n",
"df.columns = [\n",
"    'User ID (sender)',\n",
"    'User ID (receiver)',\n",
"    'User account ID (sender)',\n",
"    'User account ID (receiver)',\n",
"    'Amount of transaction',\n",
"    'Type of transaction',\n",
"    'State of operation',\n",
"    'Balance before (sender)',\n",
"    'Balance after (sender)',\n",
"    'Balance after (receiver)',\n",
"    'Balance before (receiver)',\n",
"    'Not used',\n",
"    'Not used',\n",
"    'Not used',\n",
"    'Not used',\n",
"    'Transaction timestamp (sender)',\n",
"    'Transaction timestamp (receiver)',\n",
"    'Sender account ID',\n",
"    'Not used',\n",
"    'Not used',\n",
"    'Not used',\n",
"    'Transaction timestamp',\n",
"    'Sender type',\n",
"    'Receiver type'\n",
"]\n",
"\n",
"# Drop the unused columns and the columns that are dropped by the analysis\n",
"# (constant state, duplicated sender ID / timestamps, unused balances).\n",
"df = df.loc[:, ~df.columns.str.contains('^Not used', case=False)]\n",
"df = df.drop(columns=[\n",
"    'State of operation',\n",
"    'Sender account ID',\n",
"    'Transaction timestamp (sender)',\n",
"    'Transaction timestamp (receiver)',\n",
"    'Balance before (sender)',\n",
"    'Balance after (sender)',\n",
"    'Balance before (receiver)',\n",
"    'Balance after (receiver)',\n",
"])\n",
"\n",
"# Parse the timestamp explicitly: left as strings, sorting would be lexicographic, not chronological.\n",
"# NOTE(review): assumes every value is day-first 'dd.mm.YYYY HH:MM' (e.g. '08.07.2011 15:16') - confirm against the CSV.\n",
"df['Transaction timestamp'] = pd.to_datetime(df['Transaction timestamp'], format='%d.%m.%Y %H:%M')\n",
"df = df.sort_values('Transaction timestamp')\n",
"\n",
"# Amounts that cannot be parsed as numbers are counted as 0.\n",
"df[\"Amount of transaction\"] = pd.to_numeric(df[\"Amount of transaction\"], errors='coerce').fillna(0)\n",
"\n",
"df.describe(include='all').fillna('')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Описание набора данных"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"| Название столбца | Возможные значения |Описание |\n",
"|----------------------------------------|----------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n",
"| User ID (transaction sender) | Generated ID | |\n",
"| User ID (transaction receiver) | Generated ID | |\n",
"| User account ID (transaction sender) | Generated ID | |\n",
"| User account ID (transaction receiver) | Generated ID | |\n",
"| Amount of transaction | Number | |\n",
"| Type of transaction | `Ind`<br/>`Dt`<br/>`ArRC`<br/>`Wl`<br/>`Merchant` | Тип транзакции <br/>`Ind` денежный перевод между пользователями системы <br/>`Dt` пополнение электронного кошелька (отправитель агент, а получатель - пользователь системы)<br/>`ArRC` пополнение счета мобильной связи (перевод от пользователя системы к оператору мобильной связи )<br/>`Wl` снятие электронных денег (отправитель - пользователь системы, получатель - оператор)<br/>`Merchant` перевод от пользователя поставщику услуг или товаров |\n",
"| State of operation | `SU` | `SU` успешно |\n",
"| Balance before (transaction sender) | Number | |\n",
"| Balance before (transaction receiver) | Number | |\n",
"| Balance after (transaction sender) | Number | |\n",
"| Balance after (transaction receiver) | Number | |\n",
"| Transaction timestamp (sender) | Datetime | |\n",
"| Transaction timestamp (receiver) | Datetime | |\n",
"| Sender account ID | Generated ID | |\n",
"| Transaction timestamp | Datetime | |\n",
"| Sender type | `EU`<br/>`RET` | |\n",
"| Receiver type | `EU`<br/>`operator`<br/>`RET`<br/>`MER` | |\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Поскольку поле `State of operation` всегда имеет значение (`SU`) для всех транзакций, данный столбец предлагается удалить. \n",
"Столбцы `Sender account ID` и `User ID (transaction sender)` идентичны, также столбцы `Transaction timestamp (sender)` и `Transaction timestamp (receiver)` идентичны столбцу `Transaction timestamp`, поэтому данные столбцы удаляются (остается только `Transaction timestamp`). Также удаляются столбцы с балансом, т.к. в текущей версии набора данных они не задействованы."
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"User ID (sender) object\n",
"User ID (receiver) object\n",
"User account ID (sender) object\n",
"User account ID (receiver) object\n",
"Amount of transaction float64\n",
"Type of transaction object\n",
"Transaction timestamp object\n",
"Sender type object\n",
"Receiver type object\n",
"dtype: object"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Статистика транзакций для каждого пользователя\n",
"\n",
"Традиционно начнем со статистического анализа данных. Рекомендуется расширить число рассчитываемых статистик, например, включив показатели, характеризующие частоту транзакций. Для такого вида мошенничества, как кража телефона, изменение частоты снятий является характерным признаком."
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"def init_stat_dict():\n",
" stat_dict = dict()\n",
" transaction_types = {\"Ind\", \"Wl\", \"Dt\", \"Merchant\", \"ArRC\"} \n",
" for tran_type in transaction_types:\n",
" amount_name = f\"Sent_amount_{tran_type}\"\n",
" amount_median = f\"Sent_amount_{tran_type}_median\"\n",
" amount_min = f\"Sent_amount_{tran_type}_min\"\n",
" amount_max = f\"Sent_amount_{tran_type}_max\"\n",
" tran_count = f\"Sent_{tran_type}_count\"\n",
" rec_amount_name = f\"Received_amount_{tran_type}\"\n",
" rec_amount_median = f\"Received_amount_{tran_type}_median\"\n",
" rec_amount_min = f\"Received_amount_{tran_type}_min\"\n",
" rec_amount_max = f\"Received_amount_{tran_type}_max\"\n",
" rec_tran_count = f\"Received_{tran_type}_count\"\n",
" \n",
" stat_dict[amount_name] = 0\n",
" stat_dict[amount_median] = 0\n",
" stat_dict[amount_min] = 0\n",
" stat_dict[amount_max] = 0\n",
" stat_dict[tran_count] = 0\n",
" stat_dict[rec_amount_name] = 0\n",
" stat_dict[rec_amount_median] = 0\n",
" stat_dict[rec_amount_min] = 0\n",
" stat_dict[rec_amount_max] = 0\n",
" stat_dict[rec_tran_count] = 0\n",
"\n",
" return stat_dict\n",
"\n",
"\n",
"def _amount_stats(transactions, direction, transaction_types):\n",
"    \"\"\"Aggregate the transaction amounts of one user per transaction type.\n",
"\n",
"    ``direction`` is the key prefix (\"Sent\" or \"Received\"). The returned dict\n",
"    holds sum, median, min, max and count of ``Amount of transaction`` for each\n",
"    type in ``transaction_types``; median/min/max are NaN for types without rows.\n",
"    \"\"\"\n",
"    stats = {}\n",
"    for tran_type in transaction_types:\n",
"        amounts = transactions.loc[\n",
"            transactions[\"Type of transaction\"] == tran_type, \"Amount of transaction\"\n",
"        ]\n",
"        stats[f\"{direction}_amount_{tran_type}\"] = amounts.sum()\n",
"        stats[f\"{direction}_amount_{tran_type}_median\"] = amounts.median()\n",
"        stats[f\"{direction}_amount_{tran_type}_min\"] = amounts.min()\n",
"        stats[f\"{direction}_amount_{tran_type}_max\"] = amounts.max()\n",
"        stats[f\"{direction}_{tran_type}_count\"] = amounts.count()\n",
"    return stats\n",
"\n",
"\n",
"def get_stat_df(df):\n",
"    \"\"\"Build a table with one row of transaction statistics per user.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    df : pandas.DataFrame\n",
"        Transactions with the columns ``User ID (sender)``, ``User ID (receiver)``,\n",
"        ``Sender type``, ``Receiver type``, ``Type of transaction`` and\n",
"        ``Amount of transaction``.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        One row per user that occurs as sender or receiver (index 0..n-1) with the\n",
"        keys of ``init_stat_dict()`` plus ``User ID``, ``User type``,\n",
"        ``Unique_receivers`` and ``Unique_senders``. Missing values are 0.\n",
"    \"\"\"\n",
"    transaction_types = (\"Ind\", \"Wl\", \"Dt\", \"Merchant\", \"ArRC\")\n",
"\n",
"    sent_unique_users = df[\"User ID (sender)\"].unique()\n",
"    received_unique_users = df[\"User ID (receiver)\"].unique()\n",
"    unique_users = np.unique(np.concatenate((sent_unique_users, received_unique_users), 0))\n",
"    print(unique_users)\n",
"\n",
"    # Split the transactions once per direction instead of re-scanning the\n",
"    # whole frame for every user; row order inside each group is preserved.\n",
"    sent_by_user = {user: rows for user, rows in df.groupby(\"User ID (sender)\", sort=False)}\n",
"    received_by_user = {user: rows for user, rows in df.groupby(\"User ID (receiver)\", sort=False)}\n",
"\n",
"    records = []\n",
"    for user in unique_users:\n",
"        stat_dict = init_stat_dict()\n",
"        stat_dict[\"User ID\"] = user\n",
"        # Always present, so both columns exist even if no user needs them.\n",
"        stat_dict[\"Unique_receivers\"] = 0\n",
"        stat_dict[\"Unique_senders\"] = 0\n",
"\n",
"        sent_df = sent_by_user.get(user)\n",
"        received_df = received_by_user.get(user)\n",
"\n",
"        if sent_df is not None:\n",
"            stat_dict[\"Unique_receivers\"] = len(sent_df[\"User ID (receiver)\"].unique())\n",
"            stat_dict[\"User type\"] = sent_df[\"Sender type\"].iloc[0]\n",
"            stat_dict.update(_amount_stats(sent_df, \"Sent\", transaction_types))\n",
"        else:\n",
"            # A user that never sent anything must occur as a receiver.\n",
"            stat_dict[\"User type\"] = received_df[\"Receiver type\"].iloc[0]\n",
"\n",
"        if received_df is not None:\n",
"            stat_dict[\"Unique_senders\"] = len(received_df[\"User ID (sender)\"].unique())\n",
"            stat_dict.update(_amount_stats(received_df, \"Received\", transaction_types))\n",
"\n",
"        records.append(stat_dict)\n",
"\n",
"    # Build the frame once (no quadratic concat); NaN median/min/max of\n",
"    # transaction types a user never used are reported as 0.\n",
"    return pd.DataFrame(records).fillna(0)\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Кстати, обратите внимание: уникальных пользователей в системе 2009. Это больше, чем число уникальных отправителей (1861) и уникальных получателей (1562), значит, какие-то пользователи только отправляют деньги, а какие-то только получают."
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['PN_EU_0_0' 'PN_EU_0_1' 'PN_EU_0_10' ... 'PN_Ret5' 'PN_Ret6' 'operator']\n",
"(2009, 54)\n",
" Sent_amount_Wl Sent_amount_Wl_median Sent_amount_Wl_min \n",
"0 0.0 0.0 0.0 \\\n",
"0 0.0 0.0 0.0 \n",
"0 0.0 0.0 0.0 \n",
"0 0.0 0.0 0.0 \n",
"0 0.0 0.0 0.0 \n",
"\n",
" Sent_amount_Wl_max Sent_Wl_count Received_amount_Wl \n",
"0 0.0 0 0.0 \\\n",
"0 0.0 0 0.0 \n",
"0 0.0 0 0.0 \n",
"0 0.0 0 0.0 \n",
"0 0.0 0 0.0 \n",
"\n",
" Received_amount_Wl_median Received_amount_Wl_min Received_amount_Wl_max \n",
"0 0.0 0.0 0.0 \\\n",
"0 0.0 0.0 0.0 \n",
"0 0.0 0.0 0.0 \n",
"0 0.0 0.0 0.0 \n",
"0 0.0 0.0 0.0 \n",
"\n",
" Received_Wl_count ... Sent_Dt_count Received_amount_Dt \n",
"0 0 ... 0 686643.36 \\\n",
"0 0 ... 0 483467.30 \n",
"0 0 ... 0 0.00 \n",
"0 0 ... 0 0.00 \n",
"0 0 ... 0 0.00 \n",
"\n",
" Received_amount_Dt_median Received_amount_Dt_min Received_amount_Dt_max \n",
"0 27845.615 15965.17 41729.94 \\\n",
"0 35925.855 8067.95 86422.48 \n",
"0 0.000 0.00 0.00 \n",
"0 0.000 0.00 0.00 \n",
"0 0.000 0.00 0.00 \n",
"\n",
" Received_Dt_count User ID User type Unique_senders \n",
"0 24 PN_EU_0_0 EU 2.0 \\\n",
"0 12 PN_EU_0_1 EU 6.0 \n",
"0 0 PN_EU_0_10 EU 2.0 \n",
"0 0 PN_EU_0_100 EU 1.0 \n",
"0 0 PN_EU_0_1000 EU 0.0 \n",
"\n",
" Unique_receivers \n",
"0 0.0 \n",
"0 0.0 \n",
"0 2.0 \n",
"0 1.0 \n",
"0 1.0 \n",
"\n",
"[5 rows x 54 columns]\n"
]
}
],
"source": [
"stat_df = get_stat_df(df)\n",
"print(stat_df.shape)\n",
"stat_df.head()\n",
"\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Была выбрана часть статистик и построены проекции пользователей. Анализируемые поля были выбраны на основе анализа свойств возможных финансовых аномалий (т.е. просто эвристически:))."
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'sklearn'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[47], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mpandas\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mplotting\u001b[39;00m \u001b[39mimport\u001b[39;00m scatter_matrix\n\u001b[0;32m----> 2\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msklearn\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mpreprocessing\u001b[39;00m \u001b[39mimport\u001b[39;00m StandardScaler\n\u001b[1;32m 3\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msklearn\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mpreprocessing\u001b[39;00m \u001b[39mimport\u001b[39;00m LabelEncoder\n\u001b[1;32m 4\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msklearn\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mdecomposition\u001b[39;00m \u001b[39mimport\u001b[39;00m PCA\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'sklearn'"
]
}
],
"source": [
"from pandas.plotting import scatter_matrix\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.decomposition import PCA\n",
"from matplotlib.ticker import FormatStrFormatter\n",
"import plotly.express as px"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Мошенничество, связанное с заражением бот-сетью."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"\n",
"Согласно описанию сценария атаки: есть множество зараженных пользователей, которые переводят деньги какому-то пользователю (\"ослу\" или \"мулу\"), и уже он выполняет операции обналичивания денег. Рассмотрен простейший вариант сценария: цепочка мулов состоит из одного звена. "
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'StandardScaler' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[48], line 11\u001b[0m\n\u001b[1;32m 8\u001b[0m x \u001b[39m=\u001b[39m stat_df[MobileBot_labels]\u001b[39m.\u001b[39mvalues\n\u001b[1;32m 10\u001b[0m \u001b[39m# нормализуем значения\u001b[39;00m\n\u001b[0;32m---> 11\u001b[0m x \u001b[39m=\u001b[39m StandardScaler()\u001b[39m.\u001b[39mfit_transform(x)\n\u001b[1;32m 13\u001b[0m pca \u001b[39m=\u001b[39m PCA(n_components\u001b[39m=\u001b[39m\u001b[39m3\u001b[39m)\n\u001b[1;32m 14\u001b[0m principalComponents \u001b[39m=\u001b[39m pca\u001b[39m.\u001b[39mfit_transform(x)\n",
"\u001b[0;31mNameError\u001b[0m: name 'StandardScaler' is not defined"
]
}
],
"source": [
"\n",
"# Features for the bot-net scenario: transfer and withdrawal counts plus the number of\n",
"# distinct counterparties in both directions (a mule receives money from many senders).\n",
"\n",
"MobileBot_labels = ['Unique_senders', 'Unique_receivers', 'Sent_Ind_count', 'Sent_Wl_count', 'Received_Ind_count']\n",
"\n",
"# Features that will be used to look for users affected by phone theft.\n",
"MobileTheft_labels = ['Sent_amount_Wl', 'Sent_amount_Wl_median', 'Sent_amount_Wl_min', 'Sent_amount_Wl_max', 'Sent_Wl_count']\n",
"\n",
"x = stat_df[MobileBot_labels].values\n",
"\n",
"# Normalize the feature values before PCA.\n",
"x = StandardScaler().fit_transform(x)\n",
"\n",
"pca = PCA(n_components=3)\n",
"principalComponents = pca.fit_transform(x)\n",
"print(f'Explained variance: {pca.explained_variance_ratio_}\\tSum: {pca.explained_variance_ratio_.sum()}')\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -60,12 +60,12 @@
} }
], ],
"source": [ "source": [
"!pip install --user scipy==1.8.1\n", "%pip install --user scipy==1.8.1\n",
"!pip install --user networkx==2.7.0\n", "%pip install --user networkx==2.7.0\n",
"#uncomment when running in Google Collab\n", "#uncomment when running in Google Collab\n",
"#!apt install python3-dev graphviz libgraphviz-dev pkg-config\n", "#!apt install python3-dev graphviz libgraphviz-dev pkg-config\n",
"#!pip install pygraphviz\n", "#!pip install pygraphviz\n",
"!pip install pyvis\n", "%pip install pyvis\n",
"\n", "\n",
"import zipfile\n", "import zipfile\n",
"import itertools\n", "import itertools\n",
@ -108,7 +108,7 @@
" require.undef(\"plotly\");\n", " require.undef(\"plotly\");\n",
" requirejs.config({\n", " requirejs.config({\n",
" paths: {\n", " paths: {\n",
" 'plotly': ['https://cdn.plot.ly/plotly-2.12.1.min']\n", " 'plotly': ['https://cdn.plot.ly/plotly-2.20.0.min']\n",
" }\n", " }\n",
" });\n", " });\n",
" require(['plotly'], function(Plotly) {\n", " require(['plotly'], function(Plotly) {\n",
@ -162,7 +162,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 3, "execution_count": 2,
"metadata": { "metadata": {
"colab": { "colab": {
"base_uri": "https://localhost:8080/" "base_uri": "https://localhost:8080/"
@ -172,10 +172,15 @@
}, },
"outputs": [ "outputs": [
{ {
"name": "stdout", "ename": "FileNotFoundError",
"output_type": "stream", "evalue": "[Errno 2] No such file or directory: 'C:\\\\Practice\\\\data.zip'",
"text": [ "output_type": "error",
"['20130619.logDWH.complex.csv']\n" "traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[2], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m zip_filepath\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mC:\u001b[39m\u001b[39m\\\u001b[39m\u001b[39mPractice\u001b[39m\u001b[39m\\\u001b[39m\u001b[39mdata.zip\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m----> 3\u001b[0m \u001b[39mwith\u001b[39;00m zipfile\u001b[39m.\u001b[39;49mZipFile(zip_filepath) \u001b[39mas\u001b[39;00m z:\n\u001b[1;32m 4\u001b[0m \u001b[39mprint\u001b[39m(z\u001b[39m.\u001b[39mnamelist())\n\u001b[1;32m 5\u001b[0m \u001b[39mfor\u001b[39;00m name \u001b[39min\u001b[39;00m z\u001b[39m.\u001b[39mnamelist():\n",
"File \u001b[0;32m/usr/lib64/python3.11/zipfile.py:1283\u001b[0m, in \u001b[0;36mZipFile.__init__\u001b[0;34m(self, file, mode, compression, allowZip64, compresslevel, strict_timestamps, metadata_encoding)\u001b[0m\n\u001b[1;32m 1281\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[1;32m 1282\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m-> 1283\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfp \u001b[39m=\u001b[39m io\u001b[39m.\u001b[39;49mopen(file, filemode)\n\u001b[1;32m 1284\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mOSError\u001b[39;00m:\n\u001b[1;32m 1285\u001b[0m \u001b[39mif\u001b[39;00m filemode \u001b[39min\u001b[39;00m modeDict:\n",
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'C:\\\\Practice\\\\data.zip'"
] ]
} }
], ],
@ -219,7 +224,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 15, "execution_count": 4,
"metadata": { "metadata": {
"colab": { "colab": {
"base_uri": "https://localhost:8080/" "base_uri": "https://localhost:8080/"
@ -229,247 +234,19 @@
}, },
"outputs": [ "outputs": [
{ {
"data": { "ename": "TypeError",
"text/html": [ "evalue": "NDFrame.describe() got an unexpected keyword argument 'datetime_is_numeric'",
"<div>\n", "output_type": "error",
"<style scoped>\n", "traceback": [
" .dataframe tbody tr th:only-of-type {\n", "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
" vertical-align: middle;\n", "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
" }\n", "Cell \u001b[0;32mIn[4], line 44\u001b[0m\n\u001b[1;32m 38\u001b[0m df \u001b[39m=\u001b[39m df\u001b[39m.\u001b[39mdrop(\u001b[39m'\u001b[39m\u001b[39mBalance after (receiver)\u001b[39m\u001b[39m'\u001b[39m, axis\u001b[39m=\u001b[39m\u001b[39m1\u001b[39m)\n\u001b[1;32m 43\u001b[0m df[\u001b[39m'\u001b[39m\u001b[39mGroundtruth\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m df[\u001b[39m'\u001b[39m\u001b[39mGroundtruth\u001b[39m\u001b[39m'\u001b[39m]\u001b[39m.\u001b[39mstr\u001b[39m.\u001b[39mreplace(\u001b[39m'\u001b[39m\u001b[39m-\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39m_\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[0;32m---> 44\u001b[0m df\u001b[39m.\u001b[39;49mdescribe(include\u001b[39m=\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mall\u001b[39;49m\u001b[39m'\u001b[39;49m, datetime_is_numeric\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m)\u001b[39m.\u001b[39mfillna(\u001b[39m'\u001b[39m\u001b[39m'\u001b[39m)\n",
"\n", "\u001b[0;31mTypeError\u001b[0m: NDFrame.describe() got an unexpected keyword argument 'datetime_is_numeric'"
" .dataframe tbody tr th {\n", ]
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Groundtruth</th>\n",
" <th>User ID (sender)</th>\n",
" <th>User ID (receiver)</th>\n",
" <th>User account ID (sender)</th>\n",
" <th>User account ID (receiver)</th>\n",
" <th>Amount of transaction</th>\n",
" <th>Type of transaction</th>\n",
" <th>Transaction timestamp</th>\n",
" <th>Sender type</th>\n",
" <th>Receiver type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>54848</td>\n",
" <td>54848</td>\n",
" <td>54848</td>\n",
" <td>54848</td>\n",
" <td>54848</td>\n",
" <td>54848.0</td>\n",
" <td>54848</td>\n",
" <td>54848</td>\n",
" <td>54848</td>\n",
" <td>54848</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>8</td>\n",
" <td>1868</td>\n",
" <td>1536</td>\n",
" <td>1868</td>\n",
" <td>1536</td>\n",
" <td></td>\n",
" <td>5</td>\n",
" <td></td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>N_Reg_RC</td>\n",
" <td>PN_Ret5</td>\n",
" <td>operator</td>\n",
" <td>RAcc5</td>\n",
" <td>A0</td>\n",
" <td></td>\n",
" <td>ArRC</td>\n",
" <td></td>\n",
" <td>EU</td>\n",
" <td>operator</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>28312</td>\n",
" <td>2265</td>\n",
" <td>28312</td>\n",
" <td>2265</td>\n",
" <td>28312</td>\n",
" <td></td>\n",
" <td>28312</td>\n",
" <td></td>\n",
" <td>41981</td>\n",
" <td>28312</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>55101.369953</td>\n",
" <td></td>\n",
" <td>2011-07-22 23:58:30.741376256</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>0.32</td>\n",
" <td></td>\n",
" <td>2011-01-06 00:09:01</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>2320.885</td>\n",
" <td></td>\n",
" <td>2011-06-20 20:11:10.500000</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>6796.69</td>\n",
" <td></td>\n",
" <td>2011-07-28 20:56:54</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>82111.76</td>\n",
" <td></td>\n",
" <td>2011-09-09 22:22:44.500000</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1148351.48</td>\n",
" <td></td>\n",
" <td>2011-12-09 23:54:57</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>87307.646401</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Groundtruth User ID (sender) User ID (receiver) \\\n",
"count 54848 54848 54848 \n",
"unique 8 1868 1536 \n",
"top N_Reg_RC PN_Ret5 operator \n",
"freq 28312 2265 28312 \n",
"mean \n",
"min \n",
"25% \n",
"50% \n",
"75% \n",
"max \n",
"std \n",
"\n",
" User account ID (sender) User account ID (receiver) \\\n",
"count 54848 54848 \n",
"unique 1868 1536 \n",
"top RAcc5 A0 \n",
"freq 2265 28312 \n",
"mean \n",
"min \n",
"25% \n",
"50% \n",
"75% \n",
"max \n",
"std \n",
"\n",
" Amount of transaction Type of transaction \\\n",
"count 54848.0 54848 \n",
"unique 5 \n",
"top ArRC \n",
"freq 28312 \n",
"mean 55101.369953 \n",
"min 0.32 \n",
"25% 2320.885 \n",
"50% 6796.69 \n",
"75% 82111.76 \n",
"max 1148351.48 \n",
"std 87307.646401 \n",
"\n",
" Transaction timestamp Sender type Receiver type \n",
"count 54848 54848 54848 \n",
"unique 2 4 \n",
"top EU operator \n",
"freq 41981 28312 \n",
"mean 2011-07-22 23:58:30.741376256 \n",
"min 2011-01-06 00:09:01 \n",
"25% 2011-06-20 20:11:10.500000 \n",
"50% 2011-07-28 20:56:54 \n",
"75% 2011-09-09 22:22:44.500000 \n",
"max 2011-12-09 23:54:57 \n",
"std "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
} }
], ],
"source": [ "source": [
"df = pd.read_csv('/practice/FinFraud_Labelled.csv', sep='|', parse_dates=[16, 17, 22])\n", "df = pd.read_csv('./FinFraud_Labelled.csv', sep='|', parse_dates=[16, 17, 22])\n",
"# в файлах с вариантом задания, разделитель - \";\" \n", "# в файлах с вариантом задания, разделитель - \";\" \n",
"df.columns = [\n", "df.columns = [\n",
" 'Groundtruth', \n", " 'Groundtruth', \n",
@ -512,7 +289,7 @@
"\n", "\n",
"\n", "\n",
"df['Groundtruth'] = df['Groundtruth'].str.replace('-', '_')\n", "df['Groundtruth'] = df['Groundtruth'].str.replace('-', '_')\n",
"df.describe(include='all', datetime_is_numeric=True).fillna('')" "df.describe(include='all').fillna('')"
] ]
}, },
{ {
@ -2840,7 +2617,7 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": "3.10.9" "version": "3.11.3"
} }
}, },
"nbformat": 4, "nbformat": 4,