Unfinished
This commit is contained in:
parent
bf10a1c9f4
commit
c4fe35f29b
54031
practice4/FinFraud_unknown.csv
Normal file
54031
practice4/FinFraud_unknown.csv
Normal file
File diff suppressed because it is too large
Load Diff
755
practice4/main.ipynb
Normal file
755
practice4/main.ipynb
Normal file
@ -0,0 +1,755 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Практическая работа №4**\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Обнаружение злоумышленников в системе мобильных денежных переводов\n",
|
||||
"\n",
|
||||
"_Вариант 5_\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"1) настройка окружения"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\u001b[33mWARNING: Retrying (Retry(total=4, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x7f47347ab190>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/scipy/\u001b[0m\u001b[33m\n",
|
||||
"\u001b[0m\u001b[33mWARNING: Retrying (Retry(total=3, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x7f47345c0610>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/scipy/\u001b[0m\u001b[33m\n",
|
||||
"\u001b[0m\u001b[33mWARNING: Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x7f47345c0b50>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/scipy/\u001b[0m\u001b[33m\n",
|
||||
"\u001b[0m\u001b[33mWARNING: Retrying (Retry(total=1, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x7f47345c14d0>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/scipy/\u001b[0m\u001b[33m\n",
|
||||
"\u001b[0m\u001b[33mWARNING: Retrying (Retry(total=0, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x7f47345c1ed0>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/scipy/\u001b[0m\u001b[33m\n",
|
||||
"\u001b[0m\u001b[31mERROR: Could not find a version that satisfies the requirement scipy==1.8.1 (from versions: none)\u001b[0m\u001b[31m\n",
|
||||
"\u001b[0m\u001b[31mERROR: No matching distribution found for scipy==1.8.1\u001b[0m\u001b[31m\n",
|
||||
"\u001b[0m\n",
|
||||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\n",
|
||||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
|
||||
"Note: you may need to restart the kernel to use updated packages.\n",
|
||||
"Requirement already satisfied: networkx==2.7.0 in ./.venv/lib64/python3.11/site-packages (2.7)\n",
|
||||
"\n",
|
||||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\n",
|
||||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
|
||||
"Note: you may need to restart the kernel to use updated packages.\n",
|
||||
"Requirement already satisfied: pyvis in ./.venv/lib64/python3.11/site-packages (0.3.2)\n",
|
||||
"Requirement already satisfied: pandas in ./.venv/lib64/python3.11/site-packages (2.0.1)\n",
|
||||
"Requirement already satisfied: numpy in ./.venv/lib64/python3.11/site-packages (1.24.3)\n",
|
||||
"Requirement already satisfied: plotly in ./.venv/lib64/python3.11/site-packages (5.14.1)\n",
|
||||
"Requirement already satisfied: ipython>=5.3.0 in ./.venv/lib64/python3.11/site-packages (from pyvis) (8.13.2)\n",
|
||||
"Requirement already satisfied: jinja2>=2.9.6 in ./.venv/lib64/python3.11/site-packages (from pyvis) (3.1.2)\n",
|
||||
"Requirement already satisfied: jsonpickle>=1.4.1 in ./.venv/lib64/python3.11/site-packages (from pyvis) (3.0.1)\n",
|
||||
"Requirement already satisfied: networkx>=1.11 in ./.venv/lib64/python3.11/site-packages (from pyvis) (2.7)\n",
|
||||
"Requirement already satisfied: python-dateutil>=2.8.2 in ./.venv/lib64/python3.11/site-packages (from pandas) (2.8.2)\n",
|
||||
"Requirement already satisfied: pytz>=2020.1 in ./.venv/lib64/python3.11/site-packages (from pandas) (2023.3)\n",
|
||||
"Requirement already satisfied: tzdata>=2022.1 in ./.venv/lib64/python3.11/site-packages (from pandas) (2023.3)\n",
|
||||
"Requirement already satisfied: tenacity>=6.2.0 in ./.venv/lib64/python3.11/site-packages (from plotly) (8.2.2)\n",
|
||||
"Requirement already satisfied: packaging in ./.venv/lib64/python3.11/site-packages (from plotly) (23.1)\n",
|
||||
"Requirement already satisfied: backcall in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (0.2.0)\n",
|
||||
"Requirement already satisfied: decorator in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (5.1.1)\n",
|
||||
"Requirement already satisfied: jedi>=0.16 in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (0.18.2)\n",
|
||||
"Requirement already satisfied: matplotlib-inline in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (0.1.6)\n",
|
||||
"Requirement already satisfied: pickleshare in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (0.7.5)\n",
|
||||
"Requirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (3.0.38)\n",
|
||||
"Requirement already satisfied: pygments>=2.4.0 in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (2.15.1)\n",
|
||||
"Requirement already satisfied: stack-data in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (0.6.2)\n",
|
||||
"Requirement already satisfied: traitlets>=5 in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (5.9.0)\n",
|
||||
"Requirement already satisfied: pexpect>4.3 in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (4.8.0)\n",
|
||||
"Requirement already satisfied: MarkupSafe>=2.0 in ./.venv/lib64/python3.11/site-packages (from jinja2>=2.9.6->pyvis) (2.1.2)\n",
|
||||
"Requirement already satisfied: six>=1.5 in ./.venv/lib64/python3.11/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)\n",
|
||||
"Requirement already satisfied: parso<0.9.0,>=0.8.0 in ./.venv/lib64/python3.11/site-packages (from jedi>=0.16->ipython>=5.3.0->pyvis) (0.8.3)\n",
|
||||
"Requirement already satisfied: ptyprocess>=0.5 in ./.venv/lib64/python3.11/site-packages (from pexpect>4.3->ipython>=5.3.0->pyvis) (0.7.0)\n",
|
||||
"Requirement already satisfied: wcwidth in ./.venv/lib64/python3.11/site-packages (from prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30->ipython>=5.3.0->pyvis) (0.2.6)\n",
|
||||
"Requirement already satisfied: executing>=1.2.0 in ./.venv/lib64/python3.11/site-packages (from stack-data->ipython>=5.3.0->pyvis) (1.2.0)\n",
|
||||
"Requirement already satisfied: asttokens>=2.1.0 in ./.venv/lib64/python3.11/site-packages (from stack-data->ipython>=5.3.0->pyvis) (2.2.1)\n",
|
||||
"Requirement already satisfied: pure-eval in ./.venv/lib64/python3.11/site-packages (from stack-data->ipython>=5.3.0->pyvis) (0.2.2)\n",
|
||||
"\n",
|
||||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.1.2\u001b[0m\n",
|
||||
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
|
||||
"Note: you may need to restart the kernel to use updated packages.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# %pip install scipy==1.8.1\n",
|
||||
"# %pip install networkx==2.7.0\n",
|
||||
"# %pip install pyvis pandas numpy plotly\n",
|
||||
"\n",
|
||||
"from functools import reduce\n",
|
||||
"from pyvis import network as net\n",
|
||||
"\n",
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"import networkx as nx\n",
|
||||
"import plotly.express as px\n",
|
||||
"import plotly.graph_objects as go\n",
|
||||
"\n",
|
||||
"from plotly.offline import iplot\n",
|
||||
"from IPython.display import display, HTML\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# for Jupyter notebooks\n",
|
||||
"import plotly.io as pio #comment for Google collab\n",
|
||||
"pio.renderers.default='notebook'#comment for Google collab"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def pyvis_deepnote_show(nt):\n",
|
||||
"    \"\"\"Render a pyvis network inline in the notebook output.\n",
|
||||
"\n",
|
||||
"    The graph is saved as HTML into a temporary directory, read back and\n",
|
||||
"    passed to display(HTML(...)). The temporary directory is removed\n",
|
||||
"    before the function returns.\n",
|
||||
"\n",
|
||||
"    Args:\n",
|
||||
"        nt: object exposing save_graph(path), e.g. a pyvis network.\n",
|
||||
"    \"\"\"\n",
|
||||
"    # Imported locally: the notebook's import cell does not provide these.\n",
|
||||
"    import os\n",
|
||||
"    import tempfile\n",
|
||||
"\n",
|
||||
"    with tempfile.TemporaryDirectory() as tmp_dir:\n",
|
||||
"        tmp_output_filename = os.path.join(tmp_dir, \"graph.html\")\n",
|
||||
"        nt.save_graph(tmp_output_filename)\n",
|
||||
"\n",
|
||||
"        # The context manager closes the handle; it used to be left open.\n",
|
||||
"        with open(tmp_output_filename, \"r\") as f:\n",
|
||||
"            html = f.read()\n",
|
||||
"\n",
|
||||
"    display(HTML(html))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 42,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>User ID (sender)</th>\n",
|
||||
" <th>User ID (receiver)</th>\n",
|
||||
" <th>User account ID (sender)</th>\n",
|
||||
" <th>User account ID (receiver)</th>\n",
|
||||
" <th>Amount of transaction</th>\n",
|
||||
" <th>Type of transaction</th>\n",
|
||||
" <th>Transaction timestamp</th>\n",
|
||||
" <th>Sender type</th>\n",
|
||||
" <th>Receiver type</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>count</th>\n",
|
||||
" <td>54030</td>\n",
|
||||
" <td>54030</td>\n",
|
||||
" <td>54030</td>\n",
|
||||
" <td>54030</td>\n",
|
||||
" <td>54030.0</td>\n",
|
||||
" <td>54030</td>\n",
|
||||
" <td>54030</td>\n",
|
||||
" <td>54030</td>\n",
|
||||
" <td>54030</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>unique</th>\n",
|
||||
" <td>1861</td>\n",
|
||||
" <td>1562</td>\n",
|
||||
" <td>1861</td>\n",
|
||||
" <td>1562</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>5</td>\n",
|
||||
" <td>46394</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>top</th>\n",
|
||||
" <td>PN_Ret4</td>\n",
|
||||
" <td>operator</td>\n",
|
||||
" <td>RAcc4</td>\n",
|
||||
" <td>A0</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>ArRC</td>\n",
|
||||
" <td>08.07.2011 15:16</td>\n",
|
||||
" <td>EU</td>\n",
|
||||
" <td>operator</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>freq</th>\n",
|
||||
" <td>2256</td>\n",
|
||||
" <td>27901</td>\n",
|
||||
" <td>2256</td>\n",
|
||||
" <td>27901</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>27901</td>\n",
|
||||
" <td>5</td>\n",
|
||||
" <td>41246</td>\n",
|
||||
" <td>27901</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>mean</th>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>53083.47221</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>std</th>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>85834.97052</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>min</th>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>25%</th>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>2158.2525</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>50%</th>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>6257.375</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>75%</th>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>76821.9675</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>max</th>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>1053512.86</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" User ID (sender) User ID (receiver) User account ID (sender) \n",
|
||||
"count 54030 54030 54030 \\\n",
|
||||
"unique 1861 1562 1861 \n",
|
||||
"top PN_Ret4 operator RAcc4 \n",
|
||||
"freq 2256 27901 2256 \n",
|
||||
"mean \n",
|
||||
"std \n",
|
||||
"min \n",
|
||||
"25% \n",
|
||||
"50% \n",
|
||||
"75% \n",
|
||||
"max \n",
|
||||
"\n",
|
||||
" User account ID (receiver) Amount of transaction Type of transaction \n",
|
||||
"count 54030 54030.0 54030 \\\n",
|
||||
"unique 1562 5 \n",
|
||||
"top A0 ArRC \n",
|
||||
"freq 27901 27901 \n",
|
||||
"mean 53083.47221 \n",
|
||||
"std 85834.97052 \n",
|
||||
"min 0.0 \n",
|
||||
"25% 2158.2525 \n",
|
||||
"50% 6257.375 \n",
|
||||
"75% 76821.9675 \n",
|
||||
"max 1053512.86 \n",
|
||||
"\n",
|
||||
" Transaction timestamp Sender type Receiver type \n",
|
||||
"count 54030 54030 54030 \n",
|
||||
"unique 46394 2 4 \n",
|
||||
"top 08.07.2011 15:16 EU operator \n",
|
||||
"freq 5 41246 27901 \n",
|
||||
"mean \n",
|
||||
"std \n",
|
||||
"min \n",
|
||||
"25% \n",
|
||||
"50% \n",
|
||||
"75% \n",
|
||||
"max "
|
||||
]
|
||||
},
|
||||
"execution_count": 42,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Positional names for the 24 CSV columns; unused ones share the 'Not used' placeholder.\n",
|
||||
"column_names = [\n",
|
||||
"    'User ID (sender)',\n",
|
||||
"    'User ID (receiver)',\n",
|
||||
"    'User account ID (sender)',\n",
|
||||
"    'User account ID (receiver)',\n",
|
||||
"    'Amount of transaction',\n",
|
||||
"    'Type of transaction',\n",
|
||||
"    'State of operation',\n",
|
||||
"    'Balance before (sender)',\n",
|
||||
"    'Balance after (sender)',\n",
|
||||
"    'Balance after (receiver)',\n",
|
||||
"    'Balance before (receiver)',\n",
|
||||
"    'Not used',\n",
|
||||
"    'Not used',\n",
|
||||
"    'Not used',\n",
|
||||
"    'Not used',\n",
|
||||
"    'Transaction timestamp (sender)',\n",
|
||||
"    'Transaction timestamp (receiver)',\n",
|
||||
"    'Sender account ID',\n",
|
||||
"    'Not used',\n",
|
||||
"    'Not used',\n",
|
||||
"    'Not used',\n",
|
||||
"    'Transaction timestamp',\n",
|
||||
"    'Sender type',\n",
|
||||
"    'Receiver type'\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"# Columns dropped for this analysis: constant state, duplicated ids/timestamps, unused balances.\n",
|
||||
"dropped_columns = [\n",
|
||||
"    'State of operation',\n",
|
||||
"    'Sender account ID',\n",
|
||||
"    'Transaction timestamp (sender)',\n",
|
||||
"    'Transaction timestamp (receiver)',\n",
|
||||
"    'Balance before (sender)',\n",
|
||||
"    'Balance after (sender)',\n",
|
||||
"    'Balance before (receiver)',\n",
|
||||
"    'Balance after (receiver)',\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"df = pd.read_csv('./FinFraud_unknown.csv', sep=',')\n",
|
||||
"df.columns = column_names\n",
|
||||
"df = df.loc[:, ~df.columns.str.contains('^Not used', case=False)].drop(columns=dropped_columns)\n",
|
||||
"\n",
|
||||
"# parse_dates in read_csv left this column as strings (dtype object), so sorting was lexicographic.\n",
|
||||
"# Values look like '08.07.2011 15:16'; assumed day-first - TODO confirm against the CSV.\n",
|
||||
"df['Transaction timestamp'] = pd.to_datetime(df['Transaction timestamp'], dayfirst=True)\n",
|
||||
"df = df.sort_values('Transaction timestamp')\n",
|
||||
"\n",
|
||||
"# NOTE: amounts that are not numeric are silently replaced by 0 here.\n",
|
||||
"df[\"Amount of transaction\"] = pd.to_numeric(df[\"Amount of transaction\"], errors='coerce').fillna(0)\n",
|
||||
"\n",
|
||||
"df.describe(include='all').fillna('')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Описание набора данных"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\n",
|
||||
"| Название столбца | Возможные значения |Описание |\n",
|
||||
"|----------------------------------------|----------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n",
|
||||
"| User ID (transaction sender) | Generated ID | |\n",
|
||||
"| User ID (transaction receiver) | Generated ID | |\n",
|
||||
"| User account ID (transaction sender) | Generated ID | |\n",
|
||||
"| User account ID (transaction receiver) | Generated ID | |\n",
|
||||
"| Amount of transaction | Number | |\n",
|
||||
"| Type of transaction | `Ind`<br/>`Dt`<br/>`ArRC`<br/>`Wl`<br/>`Merchant` | Тип транзакции <br/>`Ind` – денежный перевод между пользователями системы <br/>`Dt` – пополнение электронного кошелька (отправитель агент, а получатель - пользователь системы)<br/>`ArRC` – пополнение счета мобильной связи (перевод от пользователя системы к оператору мобильной связи )<br/>`Wl` – снятие электронных денег (отправитель - пользователь системы, получатель - оператор)<br/>`Merchant` – перевод от пользователя поставщику услуг или товаров |\n",
|
||||
"| State of operation | `SU` | `SU` – успешно |\n",
|
||||
"| Balance before (transaction sender) | Number | |\n",
|
||||
"| Balance before (transaction receiver) | Number | |\n",
|
||||
"| Balance after (transaction sender) | Number | |\n",
|
||||
"| Balance after (transaction receiver) | Number | |\n",
|
||||
"| Transaction timestamp (sender) | Datetime | |\n",
|
||||
"| Transaction timestamp (receiver) | Datetime | |\n",
|
||||
"| Sender account ID | Generated ID | |\n",
|
||||
"| Transaction timestamp | Datetime | |\n",
|
||||
"| Sender type | `EU`<br/>`RET` | |\n",
|
||||
"| Receiver type | `EU`<br/>`operator`<br/>`RET`<br/>`MER` | |\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Поскольку поле `State of operation` всегда имеет значение (`SU`) для всех транзакций, данный столбец предлагается удалить. \n",
|
||||
"Столбцы `Sender account ID` и `User ID (transaction sender)` идентичны, также столбцы `Transaction timestamp (sender)` и `Transaction timestamp (receiver)` идентичны столбцу `Transaction timestamp`, поэтому данные столбцы удаляются (остается только `Transaction timestamp`). Также удаляются столбцы с балансом, т.к. в текущей версии набора данных они не задействованы."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 43,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"User ID (sender) object\n",
|
||||
"User ID (receiver) object\n",
|
||||
"User account ID (sender) object\n",
|
||||
"User account ID (receiver) object\n",
|
||||
"Amount of transaction float64\n",
|
||||
"Type of transaction object\n",
|
||||
"Transaction timestamp object\n",
|
||||
"Sender type object\n",
|
||||
"Receiver type object\n",
|
||||
"dtype: object"
|
||||
]
|
||||
},
|
||||
"execution_count": 43,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df.dtypes"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Статистика транзакций для каждого пользователя\n",
|
||||
"\n",
|
||||
"Традиционно начнем со статистического анализа данных. Рекомендуется расширить число рассчитываемых статистик, например, включив показатели, характеризующие частоту транзакций. Для такого вида мошенничества, как кража телефона, изменение частоты снятий является характерным признаком."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 44,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def init_stat_dict():\n",
|
||||
"    \"\"\"Return a fresh dict with every per-user statistic initialised to 0.\n",
|
||||
"\n",
|
||||
"    For each transaction type there are five 'Sent' and five 'Received'\n",
|
||||
"    keys: total amount, median, min, max and transaction count.\n",
|
||||
"    The types are iterated from a tuple, so the key order is the same on\n",
|
||||
"    every run (a set of strings iterates in hash-dependent order).\n",
|
||||
"    \"\"\"\n",
|
||||
"    transaction_types = (\"Ind\", \"Wl\", \"Dt\", \"Merchant\", \"ArRC\")\n",
|
||||
"    stat_dict = {}\n",
|
||||
"    for tran_type in transaction_types:\n",
|
||||
"        for direction in (\"Sent\", \"Received\"):\n",
|
||||
"            stat_dict[f\"{direction}_amount_{tran_type}\"] = 0\n",
|
||||
"            stat_dict[f\"{direction}_amount_{tran_type}_median\"] = 0\n",
|
||||
"            stat_dict[f\"{direction}_amount_{tran_type}_min\"] = 0\n",
|
||||
"            stat_dict[f\"{direction}_amount_{tran_type}_max\"] = 0\n",
|
||||
"            stat_dict[f\"{direction}_{tran_type}_count\"] = 0\n",
|
||||
"\n",
|
||||
"    return stat_dict\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def _amount_stats(user_df, prefix):\n",
|
||||
"    \"\"\"Summarise transaction amounts per transaction type.\n",
|
||||
"\n",
|
||||
"    Returns a dict holding sum, median, min, max and count of\n",
|
||||
"    'Amount of transaction' for each type, using the same key names as\n",
|
||||
"    init_stat_dict() with the given prefix ('Sent' or 'Received').\n",
|
||||
"    \"\"\"\n",
|
||||
"    stats = {}\n",
|
||||
"    for tran_type in (\"Ind\", \"Wl\", \"Dt\", \"Merchant\", \"ArRC\"):\n",
|
||||
"        amounts = user_df.loc[user_df[\"Type of transaction\"] == tran_type, \"Amount of transaction\"]\n",
|
||||
"        stats[f\"{prefix}_amount_{tran_type}\"] = amounts.sum()\n",
|
||||
"        # Median on both sides: the sent side used to store the mean under the *_median key.\n",
|
||||
"        stats[f\"{prefix}_amount_{tran_type}_median\"] = amounts.median()\n",
|
||||
"        stats[f\"{prefix}_amount_{tran_type}_min\"] = amounts.min()\n",
|
||||
"        stats[f\"{prefix}_amount_{tran_type}_max\"] = amounts.max()\n",
|
||||
"        stats[f\"{prefix}_{tran_type}_count\"] = amounts.count()\n",
|
||||
"    return stats\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def get_stat_df(df):\n",
|
||||
"    \"\"\"Build a table with one row of transaction statistics per user.\n",
|
||||
"\n",
|
||||
"    A user is anyone who appears as sender or receiver in df. Each row holds\n",
|
||||
"    'User ID', 'User type', 'Unique_receivers' / 'Unique_senders' and, per\n",
|
||||
"    transaction type, sum/median/min/max/count of sent and received amounts.\n",
|
||||
"    Statistics with no matching rows are reported as 0.\n",
|
||||
"\n",
|
||||
"    Args:\n",
|
||||
"        df: transactions with the columns 'User ID (sender)',\n",
|
||||
"            'User ID (receiver)', 'Sender type', 'Receiver type',\n",
|
||||
"            'Type of transaction' and 'Amount of transaction'.\n",
|
||||
"\n",
|
||||
"    Returns:\n",
|
||||
"        pandas.DataFrame ordered by user id and indexed 0..n-1.\n",
|
||||
"    \"\"\"\n",
|
||||
"    sender_col = \"User ID (sender)\"\n",
|
||||
"    receiver_col = \"User ID (receiver)\"\n",
|
||||
"\n",
|
||||
"    unique_users = np.unique(np.concatenate((df[sender_col].unique(), df[receiver_col].unique()), 0))\n",
|
||||
"\n",
|
||||
"    # Split the frame once per role instead of re-filtering it for every user.\n",
|
||||
"    sent_by_user = {user: rows for user, rows in df.groupby(sender_col)}\n",
|
||||
"    received_by_user = {user: rows for user, rows in df.groupby(receiver_col)}\n",
|
||||
"\n",
|
||||
"    records = []\n",
|
||||
"    for user in unique_users:\n",
|
||||
"        record = init_stat_dict()\n",
|
||||
"        record[\"User ID\"] = user\n",
|
||||
"\n",
|
||||
"        sent_df = sent_by_user.get(user)\n",
|
||||
"        received_df = received_by_user.get(user)\n",
|
||||
"\n",
|
||||
"        if sent_df is not None:\n",
|
||||
"            record[\"Unique_receivers\"] = sent_df[receiver_col].nunique(dropna=False)\n",
|
||||
"            record[\"User type\"] = sent_df[\"Sender type\"].iloc[0]\n",
|
||||
"            record.update(_amount_stats(sent_df, \"Sent\"))\n",
|
||||
"        else:\n",
|
||||
"            # Receive-only user: take the type from the receiving side.\n",
|
||||
"            record[\"User type\"] = received_df[\"Receiver type\"].iloc[0]\n",
|
||||
"\n",
|
||||
"        if received_df is not None:\n",
|
||||
"            record[\"Unique_senders\"] = received_df[sender_col].nunique(dropna=False)\n",
|
||||
"            record.update(_amount_stats(received_df, \"Received\"))\n",
|
||||
"\n",
|
||||
"        records.append(record)\n",
|
||||
"\n",
|
||||
"    # One DataFrame construction instead of a concat per user; keys missing\n",
|
||||
"    # from a record (e.g. Unique_senders for send-only users) become 0.\n",
|
||||
"    return pd.DataFrame(records).fillna(0)\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Кстати, обратите внимание: уникальных пользователей в системе 2009. Это больше, чем число уникальных отправителей и число уникальных получателей по отдельности, значит, какие-то пользователи только отправляют деньги, а какие-то только получают."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 46,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"['PN_EU_0_0' 'PN_EU_0_1' 'PN_EU_0_10' ... 'PN_Ret5' 'PN_Ret6' 'operator']\n",
|
||||
"(2009, 54)\n",
|
||||
" Sent_amount_Wl Sent_amount_Wl_median Sent_amount_Wl_min \n",
|
||||
"0 0.0 0.0 0.0 \\\n",
|
||||
"0 0.0 0.0 0.0 \n",
|
||||
"0 0.0 0.0 0.0 \n",
|
||||
"0 0.0 0.0 0.0 \n",
|
||||
"0 0.0 0.0 0.0 \n",
|
||||
"\n",
|
||||
" Sent_amount_Wl_max Sent_Wl_count Received_amount_Wl \n",
|
||||
"0 0.0 0 0.0 \\\n",
|
||||
"0 0.0 0 0.0 \n",
|
||||
"0 0.0 0 0.0 \n",
|
||||
"0 0.0 0 0.0 \n",
|
||||
"0 0.0 0 0.0 \n",
|
||||
"\n",
|
||||
" Received_amount_Wl_median Received_amount_Wl_min Received_amount_Wl_max \n",
|
||||
"0 0.0 0.0 0.0 \\\n",
|
||||
"0 0.0 0.0 0.0 \n",
|
||||
"0 0.0 0.0 0.0 \n",
|
||||
"0 0.0 0.0 0.0 \n",
|
||||
"0 0.0 0.0 0.0 \n",
|
||||
"\n",
|
||||
" Received_Wl_count ... Sent_Dt_count Received_amount_Dt \n",
|
||||
"0 0 ... 0 686643.36 \\\n",
|
||||
"0 0 ... 0 483467.30 \n",
|
||||
"0 0 ... 0 0.00 \n",
|
||||
"0 0 ... 0 0.00 \n",
|
||||
"0 0 ... 0 0.00 \n",
|
||||
"\n",
|
||||
" Received_amount_Dt_median Received_amount_Dt_min Received_amount_Dt_max \n",
|
||||
"0 27845.615 15965.17 41729.94 \\\n",
|
||||
"0 35925.855 8067.95 86422.48 \n",
|
||||
"0 0.000 0.00 0.00 \n",
|
||||
"0 0.000 0.00 0.00 \n",
|
||||
"0 0.000 0.00 0.00 \n",
|
||||
"\n",
|
||||
" Received_Dt_count User ID User type Unique_senders \n",
|
||||
"0 24 PN_EU_0_0 EU 2.0 \\\n",
|
||||
"0 12 PN_EU_0_1 EU 6.0 \n",
|
||||
"0 0 PN_EU_0_10 EU 2.0 \n",
|
||||
"0 0 PN_EU_0_100 EU 1.0 \n",
|
||||
"0 0 PN_EU_0_1000 EU 0.0 \n",
|
||||
"\n",
|
||||
" Unique_receivers \n",
|
||||
"0 0.0 \n",
|
||||
"0 0.0 \n",
|
||||
"0 2.0 \n",
|
||||
"0 1.0 \n",
|
||||
"0 1.0 \n",
|
||||
"\n",
|
||||
"[5 rows x 54 columns]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"stat_df = get_stat_df(df)\n",
|
||||
"print(stat_df.shape)\n",
|
||||
"# print(stat_df.head())\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Была выбрана часть статистик, и по ним построены проекции пользователей. Анализируемые поля были выбраны на основе анализа свойств возможных финансовых аномалий (т.е. просто эвристически:))."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 47,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "ModuleNotFoundError",
|
||||
"evalue": "No module named 'sklearn'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[0;32mIn[47], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mpandas\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mplotting\u001b[39;00m \u001b[39mimport\u001b[39;00m scatter_matrix\n\u001b[0;32m----> 2\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msklearn\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mpreprocessing\u001b[39;00m \u001b[39mimport\u001b[39;00m StandardScaler\n\u001b[1;32m 3\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msklearn\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mpreprocessing\u001b[39;00m \u001b[39mimport\u001b[39;00m LabelEncoder\n\u001b[1;32m 4\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msklearn\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mdecomposition\u001b[39;00m \u001b[39mimport\u001b[39;00m PCA\n",
|
||||
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'sklearn'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from pandas.plotting import scatter_matrix\n",
|
||||
"from sklearn.preprocessing import StandardScaler\n",
|
||||
"from sklearn.preprocessing import LabelEncoder\n",
|
||||
"from sklearn.decomposition import PCA\n",
|
||||
"from matplotlib.ticker import FormatStrFormatter\n",
|
||||
"import plotly.express as px"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Мошенничество, связанное с заражением бот-сетью."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\n",
|
||||
"\n",
|
||||
"Согласно описанию сценария атаки: есть множество зараженных пользователей, которые переводят деньги какому-то пользователю (\"ослу\" или \"мулу\"), и уже он выполняет операции обналичивания денег. Рассмотрен простейший вариант сценария: цепочка мулов состоит из одного звена. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 48,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "NameError",
|
||||
"evalue": "name 'StandardScaler' is not defined",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[0;32mIn[48], line 11\u001b[0m\n\u001b[1;32m 8\u001b[0m x \u001b[39m=\u001b[39m stat_df[MobileBot_labels]\u001b[39m.\u001b[39mvalues\n\u001b[1;32m 10\u001b[0m \u001b[39m# нормализуем значения\u001b[39;00m\n\u001b[0;32m---> 11\u001b[0m x \u001b[39m=\u001b[39m StandardScaler()\u001b[39m.\u001b[39mfit_transform(x)\n\u001b[1;32m 13\u001b[0m pca \u001b[39m=\u001b[39m PCA(n_components\u001b[39m=\u001b[39m\u001b[39m3\u001b[39m)\n\u001b[1;32m 14\u001b[0m principalComponents \u001b[39m=\u001b[39m pca\u001b[39m.\u001b[39mfit_transform(x)\n",
|
||||
"\u001b[0;31mNameError\u001b[0m: name 'StandardScaler' is not defined"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\n",
|
||||
"# Bot-net scenario: keep the transfer and withdrawal counts plus the number of\n",
|
||||
"# distinct counterparties on both sides ('Unique_receivers' used to be listed twice,\n",
|
||||
"# which double-weighted it and left 'Unique_senders' out).\n",
|
||||
"MobileBot_labels = ['Unique_senders', 'Unique_receivers', 'Sent_Ind_count', 'Sent_Wl_count', 'Received_Ind_count']\n",
|
||||
"\n",
|
||||
"# These fields are meant for the later search for users whose phone was stolen.\n",
|
||||
"MobileTheft_labels = ['Sent_amount_Wl', 'Sent_amount_Wl_median', 'Sent_amount_Wl_min', 'Sent_amount_Wl_max', 'Sent_Wl_count']\n",
|
||||
"\n",
|
||||
"x = stat_df[MobileBot_labels].values\n",
|
||||
"\n",
|
||||
"# Standardise the features before PCA.\n",
|
||||
"x = StandardScaler().fit_transform(x)\n",
|
||||
"\n",
|
||||
"pca = PCA(n_components=3)\n",
|
||||
"principalComponents = pca.fit_transform(x)\n",
|
||||
"print(f'Explained variance: {pca.explained_variance_ratio_}\\tSum: {pca.explained_variance_ratio_.sum()}')\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.3"
|
||||
},
|
||||
"orig_nbformat": 4
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@ -60,12 +60,12 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!pip install --user scipy==1.8.1\n",
|
||||
"!pip install --user networkx==2.7.0\n",
|
||||
"%pip install --user scipy==1.8.1\n",
|
||||
"%pip install --user networkx==2.7.0\n",
|
||||
"#uncomment when running in Google Colab\n",
|
||||
"#!apt install python3-dev graphviz libgraphviz-dev pkg-config\n",
|
||||
"#!pip install pygraphviz\n",
|
||||
"!pip install pyvis\n",
|
||||
"%pip install pyvis\n",
|
||||
"\n",
|
||||
"import zipfile\n",
|
||||
"import itertools\n",
|
||||
@ -108,7 +108,7 @@
|
||||
" require.undef(\"plotly\");\n",
|
||||
" requirejs.config({\n",
|
||||
" paths: {\n",
|
||||
" 'plotly': ['https://cdn.plot.ly/plotly-2.12.1.min']\n",
|
||||
" 'plotly': ['https://cdn.plot.ly/plotly-2.20.0.min']\n",
|
||||
" }\n",
|
||||
" });\n",
|
||||
" require(['plotly'], function(Plotly) {\n",
|
||||
@ -162,7 +162,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
@ -172,10 +172,15 @@
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"['20130619.logDWH.complex.csv']\n"
|
||||
"ename": "FileNotFoundError",
|
||||
"evalue": "[Errno 2] No such file or directory: 'C:\\\\Practice\\\\data.zip'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[0;32mIn[2], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m zip_filepath\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mC:\u001b[39m\u001b[39m\\\u001b[39m\u001b[39mPractice\u001b[39m\u001b[39m\\\u001b[39m\u001b[39mdata.zip\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m----> 3\u001b[0m \u001b[39mwith\u001b[39;00m zipfile\u001b[39m.\u001b[39;49mZipFile(zip_filepath) \u001b[39mas\u001b[39;00m z:\n\u001b[1;32m 4\u001b[0m \u001b[39mprint\u001b[39m(z\u001b[39m.\u001b[39mnamelist())\n\u001b[1;32m 5\u001b[0m \u001b[39mfor\u001b[39;00m name \u001b[39min\u001b[39;00m z\u001b[39m.\u001b[39mnamelist():\n",
|
||||
"File \u001b[0;32m/usr/lib64/python3.11/zipfile.py:1283\u001b[0m, in \u001b[0;36mZipFile.__init__\u001b[0;34m(self, file, mode, compression, allowZip64, compresslevel, strict_timestamps, metadata_encoding)\u001b[0m\n\u001b[1;32m 1281\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[1;32m 1282\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m-> 1283\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mfp \u001b[39m=\u001b[39m io\u001b[39m.\u001b[39;49mopen(file, filemode)\n\u001b[1;32m 1284\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mOSError\u001b[39;00m:\n\u001b[1;32m 1285\u001b[0m \u001b[39mif\u001b[39;00m filemode \u001b[39min\u001b[39;00m modeDict:\n",
|
||||
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'C:\\\\Practice\\\\data.zip'"
|
||||
]
|
||||
}
|
||||
],
|
||||
@ -219,7 +224,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
@ -229,247 +234,19 @@
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Groundtruth</th>\n",
|
||||
" <th>User ID (sender)</th>\n",
|
||||
" <th>User ID (receiver)</th>\n",
|
||||
" <th>User account ID (sender)</th>\n",
|
||||
" <th>User account ID (receiver)</th>\n",
|
||||
" <th>Amount of transaction</th>\n",
|
||||
" <th>Type of transaction</th>\n",
|
||||
" <th>Transaction timestamp</th>\n",
|
||||
" <th>Sender type</th>\n",
|
||||
" <th>Receiver type</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>count</th>\n",
|
||||
" <td>54848</td>\n",
|
||||
" <td>54848</td>\n",
|
||||
" <td>54848</td>\n",
|
||||
" <td>54848</td>\n",
|
||||
" <td>54848</td>\n",
|
||||
" <td>54848.0</td>\n",
|
||||
" <td>54848</td>\n",
|
||||
" <td>54848</td>\n",
|
||||
" <td>54848</td>\n",
|
||||
" <td>54848</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>unique</th>\n",
|
||||
" <td>8</td>\n",
|
||||
" <td>1868</td>\n",
|
||||
" <td>1536</td>\n",
|
||||
" <td>1868</td>\n",
|
||||
" <td>1536</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>5</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>4</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>top</th>\n",
|
||||
" <td>N_Reg_RC</td>\n",
|
||||
" <td>PN_Ret5</td>\n",
|
||||
" <td>operator</td>\n",
|
||||
" <td>RAcc5</td>\n",
|
||||
" <td>A0</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>ArRC</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>EU</td>\n",
|
||||
" <td>operator</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>freq</th>\n",
|
||||
" <td>28312</td>\n",
|
||||
" <td>2265</td>\n",
|
||||
" <td>28312</td>\n",
|
||||
" <td>2265</td>\n",
|
||||
" <td>28312</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>28312</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>41981</td>\n",
|
||||
" <td>28312</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>mean</th>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>55101.369953</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>2011-07-22 23:58:30.741376256</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>min</th>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>0.32</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>2011-01-06 00:09:01</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>25%</th>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>2320.885</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>2011-06-20 20:11:10.500000</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>50%</th>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>6796.69</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>2011-07-28 20:56:54</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>75%</th>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>82111.76</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>2011-09-09 22:22:44.500000</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>max</th>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>1148351.48</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>2011-12-09 23:54:57</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>std</th>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td>87307.646401</td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" <td></td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Groundtruth User ID (sender) User ID (receiver) \\\n",
|
||||
"count 54848 54848 54848 \n",
|
||||
"unique 8 1868 1536 \n",
|
||||
"top N_Reg_RC PN_Ret5 operator \n",
|
||||
"freq 28312 2265 28312 \n",
|
||||
"mean \n",
|
||||
"min \n",
|
||||
"25% \n",
|
||||
"50% \n",
|
||||
"75% \n",
|
||||
"max \n",
|
||||
"std \n",
|
||||
"\n",
|
||||
" User account ID (sender) User account ID (receiver) \\\n",
|
||||
"count 54848 54848 \n",
|
||||
"unique 1868 1536 \n",
|
||||
"top RAcc5 A0 \n",
|
||||
"freq 2265 28312 \n",
|
||||
"mean \n",
|
||||
"min \n",
|
||||
"25% \n",
|
||||
"50% \n",
|
||||
"75% \n",
|
||||
"max \n",
|
||||
"std \n",
|
||||
"\n",
|
||||
" Amount of transaction Type of transaction \\\n",
|
||||
"count 54848.0 54848 \n",
|
||||
"unique 5 \n",
|
||||
"top ArRC \n",
|
||||
"freq 28312 \n",
|
||||
"mean 55101.369953 \n",
|
||||
"min 0.32 \n",
|
||||
"25% 2320.885 \n",
|
||||
"50% 6796.69 \n",
|
||||
"75% 82111.76 \n",
|
||||
"max 1148351.48 \n",
|
||||
"std 87307.646401 \n",
|
||||
"\n",
|
||||
" Transaction timestamp Sender type Receiver type \n",
|
||||
"count 54848 54848 54848 \n",
|
||||
"unique 2 4 \n",
|
||||
"top EU operator \n",
|
||||
"freq 41981 28312 \n",
|
||||
"mean 2011-07-22 23:58:30.741376256 \n",
|
||||
"min 2011-01-06 00:09:01 \n",
|
||||
"25% 2011-06-20 20:11:10.500000 \n",
|
||||
"50% 2011-07-28 20:56:54 \n",
|
||||
"75% 2011-09-09 22:22:44.500000 \n",
|
||||
"max 2011-12-09 23:54:57 \n",
|
||||
"std "
|
||||
]
|
||||
},
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
"ename": "TypeError",
|
||||
"evalue": "NDFrame.describe() got an unexpected keyword argument 'datetime_is_numeric'",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[0;32mIn[4], line 44\u001b[0m\n\u001b[1;32m 38\u001b[0m df \u001b[39m=\u001b[39m df\u001b[39m.\u001b[39mdrop(\u001b[39m'\u001b[39m\u001b[39mBalance after (receiver)\u001b[39m\u001b[39m'\u001b[39m, axis\u001b[39m=\u001b[39m\u001b[39m1\u001b[39m)\n\u001b[1;32m 43\u001b[0m df[\u001b[39m'\u001b[39m\u001b[39mGroundtruth\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m df[\u001b[39m'\u001b[39m\u001b[39mGroundtruth\u001b[39m\u001b[39m'\u001b[39m]\u001b[39m.\u001b[39mstr\u001b[39m.\u001b[39mreplace(\u001b[39m'\u001b[39m\u001b[39m-\u001b[39m\u001b[39m'\u001b[39m, \u001b[39m'\u001b[39m\u001b[39m_\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[0;32m---> 44\u001b[0m df\u001b[39m.\u001b[39;49mdescribe(include\u001b[39m=\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mall\u001b[39;49m\u001b[39m'\u001b[39;49m, datetime_is_numeric\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m)\u001b[39m.\u001b[39mfillna(\u001b[39m'\u001b[39m\u001b[39m'\u001b[39m)\n",
|
||||
"\u001b[0;31mTypeError\u001b[0m: NDFrame.describe() got an unexpected keyword argument 'datetime_is_numeric'"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df = pd.read_csv('/practice/FinFraud_Labelled.csv', sep='|', parse_dates=[16, 17, 22])\n",
|
||||
"df = pd.read_csv('./FinFraud_Labelled.csv', sep='|', parse_dates=[16, 17, 22])\n",
|
||||
"# в файлах с вариантом задания, разделитель - \";\" \n",
|
||||
"df.columns = [\n",
|
||||
" 'Groundtruth', \n",
|
||||
@ -512,7 +289,7 @@
|
||||
"\n",
|
||||
"\n",
|
||||
"df['Groundtruth'] = df['Groundtruth'].str.replace('-', '_')\n",
|
||||
"df.describe(include='all', datetime_is_numeric=True).fillna('')"
|
||||
"df.describe(include='all').fillna('')"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -2840,7 +2617,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.9"
|
||||
"version": "3.11.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
Loading…
x
Reference in New Issue
Block a user