49 KiB
Практическая работа №4
Обнаружение злоумышленников в системе мобильных денежных переводов¶
Вариант 5
- настройка окружения
# %pip install scipy==1.8.1 # %pip install networkx==2.7.0 # %pip install pyvis pandas numpy plotly from functools import reduce from pyvis import network as net import pandas as pd import numpy as np import networkx as nx import plotly.express as px import plotly.graph_objects as go from plotly.offline import iplot from IPython.display import display, HTML #for Jupiter notebooks import plotly.io as pio #comment for Google collab pio.renderers.default='notebook'#comment for Google collab
WARNING: Retrying (Retry(total=4, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x7f47347ab190>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/scipy/ WARNING: Retrying (Retry(total=3, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x7f47345c0610>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/scipy/ WARNING: Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x7f47345c0b50>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/scipy/ WARNING: Retrying (Retry(total=1, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x7f47345c14d0>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/scipy/ WARNING: Retrying (Retry(total=0, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x7f47345c1ed0>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/scipy/ ERROR: Could not find a version that satisfies the requirement scipy==1.8.1 (from versions: none) ERROR: No matching distribution found for scipy==1.8.1 [notice] A new release of pip available: 22.3.1 -> 23.1.2 [notice] To update, run: pip install --upgrade pip Note: you may need to restart the kernel to use updated packages. 
Requirement already satisfied: networkx==2.7.0 in ./.venv/lib64/python3.11/site-packages (2.7) [notice] A new release of pip available: 22.3.1 -> 23.1.2 [notice] To update, run: pip install --upgrade pip Note: you may need to restart the kernel to use updated packages. Requirement already satisfied: pyvis in ./.venv/lib64/python3.11/site-packages (0.3.2) Requirement already satisfied: pandas in ./.venv/lib64/python3.11/site-packages (2.0.1) Requirement already satisfied: numpy in ./.venv/lib64/python3.11/site-packages (1.24.3) Requirement already satisfied: plotly in ./.venv/lib64/python3.11/site-packages (5.14.1) Requirement already satisfied: ipython>=5.3.0 in ./.venv/lib64/python3.11/site-packages (from pyvis) (8.13.2) Requirement already satisfied: jinja2>=2.9.6 in ./.venv/lib64/python3.11/site-packages (from pyvis) (3.1.2) Requirement already satisfied: jsonpickle>=1.4.1 in ./.venv/lib64/python3.11/site-packages (from pyvis) (3.0.1) Requirement already satisfied: networkx>=1.11 in ./.venv/lib64/python3.11/site-packages (from pyvis) (2.7) Requirement already satisfied: python-dateutil>=2.8.2 in ./.venv/lib64/python3.11/site-packages (from pandas) (2.8.2) Requirement already satisfied: pytz>=2020.1 in ./.venv/lib64/python3.11/site-packages (from pandas) (2023.3) Requirement already satisfied: tzdata>=2022.1 in ./.venv/lib64/python3.11/site-packages (from pandas) (2023.3) Requirement already satisfied: tenacity>=6.2.0 in ./.venv/lib64/python3.11/site-packages (from plotly) (8.2.2) Requirement already satisfied: packaging in ./.venv/lib64/python3.11/site-packages (from plotly) (23.1) Requirement already satisfied: backcall in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (0.2.0) Requirement already satisfied: decorator in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (5.1.1) Requirement already satisfied: jedi>=0.16 in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (0.18.2) Requirement already 
satisfied: matplotlib-inline in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (0.1.6) Requirement already satisfied: pickleshare in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (0.7.5) Requirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (3.0.38) Requirement already satisfied: pygments>=2.4.0 in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (2.15.1) Requirement already satisfied: stack-data in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (0.6.2) Requirement already satisfied: traitlets>=5 in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (5.9.0) Requirement already satisfied: pexpect>4.3 in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (4.8.0) Requirement already satisfied: MarkupSafe>=2.0 in ./.venv/lib64/python3.11/site-packages (from jinja2>=2.9.6->pyvis) (2.1.2) Requirement already satisfied: six>=1.5 in ./.venv/lib64/python3.11/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0) Requirement already satisfied: parso<0.9.0,>=0.8.0 in ./.venv/lib64/python3.11/site-packages (from jedi>=0.16->ipython>=5.3.0->pyvis) (0.8.3) Requirement already satisfied: ptyprocess>=0.5 in ./.venv/lib64/python3.11/site-packages (from pexpect>4.3->ipython>=5.3.0->pyvis) (0.7.0) Requirement already satisfied: wcwidth in ./.venv/lib64/python3.11/site-packages (from prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30->ipython>=5.3.0->pyvis) (0.2.6) Requirement already satisfied: executing>=1.2.0 in ./.venv/lib64/python3.11/site-packages (from stack-data->ipython>=5.3.0->pyvis) (1.2.0) Requirement already satisfied: asttokens>=2.1.0 in ./.venv/lib64/python3.11/site-packages (from stack-data->ipython>=5.3.0->pyvis) (2.2.1) Requirement already satisfied: pure-eval in ./.venv/lib64/python3.11/site-packages (from stack-data->ipython>=5.3.0->pyvis) (0.2.2) [notice] A new release 
of pip available: 22.3.1 -> 23.1.2 [notice] To update, run: pip install --upgrade pip Note: you may need to restart the kernel to use updated packages.
def pyvis_deepnote_show(nt):
    """Render a pyvis graph inline in the notebook output.

    The graph is saved to a temporary HTML file, the file is read back and
    its contents are passed to ``display(HTML(...))``.

    Args:
        nt: Object exposing ``save_graph(path)``; presumably a
            ``pyvis.network.Network`` (confirm against callers).
    """
    # Imported locally: the notebook's setup cell does not import these modules.
    import os
    import tempfile

    # A temporary directory gives a path that is guaranteed to be ours and is
    # removed on exit, instead of reusing the name of an already-deleted
    # NamedTemporaryFile and leaving the generated HTML behind.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_output_filename = os.path.join(tmp_dir, "graph.html")
        nt.save_graph(tmp_output_filename)
        # Read before the directory is cleaned up; the handle is closed by `with`.
        with open(tmp_output_filename, "r") as f:
            html = f.read()

    display(HTML(html))
# Names assigned to the raw CSV columns, in file order. Columns that are not
# analysed all share the placeholder name 'Not used' so they can be filtered
# out with a single pattern.
column_names = [
    'User ID (sender)',
    'User ID (receiver)',
    'User account ID (sender)',
    'User account ID (receiver)',
    'Amount of transaction',
    'Type of transaction',
    'State of operation',
    'Balance before (sender)',
    'Balance after (sender)',
    'Balance after (receiver)',
    'Balance before (receiver)',
    'Not used',
    'Not used',
    'Not used',
    'Not used',
    'Transaction timestamp (sender)',
    'Transaction timestamp (receiver)',
    'Sender account ID',
    'Not used',
    'Not used',
    'Not used',
    'Transaction timestamp',
    'Sender type',
    'Receiver type',
]

# Named columns that are removed after loading (state, duplicate sender ID,
# per-side timestamps and all balance columns).
redundant_columns = [
    'State of operation',
    'Sender account ID',
    'Transaction timestamp (sender)',
    'Transaction timestamp (receiver)',
    'Balance before (sender)',
    'Balance after (sender)',
    'Balance before (receiver)',
    'Balance after (receiver)',
]

df = pd.read_csv('./FinFraud_unknown.csv', sep=',', parse_dates=[15, 16, 21])
df.columns = column_names

# Drop the placeholder columns, then order the rows by transaction timestamp.
# NOTE(review): the recorded `df.dtypes` output lists 'Transaction timestamp'
# as object, so this sort may be lexicographic rather than chronological - confirm.
is_placeholder = df.columns.str.contains('^Not used', case=False)
df = df.loc[:, ~is_placeholder].sort_values('Transaction timestamp')
df = df.drop(columns=redundant_columns)

# Amounts that cannot be parsed as numbers become 0.
amounts = pd.to_numeric(df["Amount of transaction"], errors='coerce')
df["Amount of transaction"] = amounts.fillna(0)

df.describe(include='all').fillna('')
User ID (sender) | User ID (receiver) | User account ID (sender) | User account ID (receiver) | Amount of transaction | Type of transaction | Transaction timestamp | Sender type | Receiver type | |
---|---|---|---|---|---|---|---|---|---|
count | 54030 | 54030 | 54030 | 54030 | 54030.0 | 54030 | 54030 | 54030 | 54030 |
unique | 1861 | 1562 | 1861 | 1562 | 5 | 46394 | 2 | 4 | |
top | PN_Ret4 | operator | RAcc4 | A0 | ArRC | 08.07.2011 15:16 | EU | operator | |
freq | 2256 | 27901 | 2256 | 27901 | 27901 | 5 | 41246 | 27901 | |
mean | 53083.47221 | ||||||||
std | 85834.97052 | ||||||||
min | 0.0 | ||||||||
25% | 2158.2525 | ||||||||
50% | 6257.375 | ||||||||
75% | 76821.9675 | ||||||||
max | 1053512.86 |
Описание набора данных¶
Название столбца | Возможные значения | Описание |
---|---|---|
User ID (transaction sender) | Generated ID | |
User ID (transaction receiver) | Generated ID | |
User account ID (transaction sender) | Generated ID | |
User account ID (transaction receiver) | Generated ID | |
Amount of transaction | Number | |
Type of transaction | Ind Dt ArRC Wl Merchant |
Тип транзакции: Ind – денежный перевод между пользователями системы; Dt – пополнение электронного кошелька (отправитель – агент, а получатель – пользователь системы); ArRC – пополнение счета мобильной связи (перевод от пользователя системы к оператору мобильной связи); Wl – снятие электронных денег (отправитель – пользователь системы, получатель – оператор); Merchant – перевод от пользователя поставщику услуг или товаров |
State of operation | SU |
SU – успешно |
Balance before (transaction sender) | Number | |
Balance before (transaction receiver) | Number | |
Balance after (transaction sender) | Number | |
Balance after (transaction receiver) | Number | |
Transaction timestamp (sender) | Datetime | |
Transaction timestamp (receiver) | Datetime | |
Sender account ID | Generated ID | |
Transaction timestamp | Datetime | |
Sender type | EU RET |
|
Receiver type | EU operator RET MER |
Поскольку поле State of operation
всегда имеет значение (SU
) для всех транзакций, данный столбец предлагается удалить.
Столбцы Sender account ID
и User ID (transaction sender)
идентичны, также столбцы Transaction timestamp (sender)
и Transaction timestamp (receiver)
идентичны столбцу Transaction timestamp
, поэтому данные столбцы удаляются (остается только Transaction timestamp
). Также удаляются столбцы с балансом, т.к. в текущей версии набора данных они не задействованы.
# Show the dtype pandas assigned to each remaining column.
df.dtypes
User ID (sender) object User ID (receiver) object User account ID (sender) object User account ID (receiver) object Amount of transaction float64 Type of transaction object Transaction timestamp object Sender type object Receiver type object dtype: object
Статистика транзакций для каждого пользователя¶
Традиционно начнем со статистического анализа данных. Рекомендуется расширить число рассчитываемых статистик, например, включив показатели, характеризующие частоту транзакций. Для такого вида мошенничества, как кража телефона, изменение частоты снятий является характерным признаком.
# Transaction types in a fixed order: iterating a tuple (not a set) keeps the
# column order of the statistics table identical between interpreter runs.
_TRANSACTION_TYPES = ("Ind", "Wl", "Dt", "Merchant", "ArRC")
_DIRECTIONS = ("Sent", "Received")


def _stat_keys(direction, tran_type):
    """Return the (sum, median, min, max, count) column names for one direction and type."""
    amount = f"{direction}_amount_{tran_type}"
    return (
        amount,
        f"{amount}_median",
        f"{amount}_min",
        f"{amount}_max",
        f"{direction}_{tran_type}_count",
    )


def _fill_direction_stats(stat_dict, user_df, direction):
    """Write per-transaction-type amount statistics of ``user_df`` into ``stat_dict``."""
    for tran_type in _TRANSACTION_TYPES:
        # Filter once per type instead of once per statistic.
        amounts = user_df.loc[
            user_df["Type of transaction"] == tran_type, "Amount of transaction"
        ]
        total, median, low, high, count = _stat_keys(direction, tran_type)
        stat_dict[total] = amounts.sum()
        # The sent side previously stored the mean under the "_median" key.
        stat_dict[median] = amounts.median()
        stat_dict[low] = amounts.min()
        stat_dict[high] = amounts.max()
        stat_dict[count] = amounts.count()


def init_stat_dict():
    """Return a dict with every per-type statistic key initialised to 0.

    For each transaction type the dict holds ``Sent_amount_<type>``, its
    ``_median``/``_min``/``_max`` variants and ``Sent_<type>_count``, followed
    by the same five keys with the ``Received`` prefix.
    """
    stat_dict = {}
    for tran_type in _TRANSACTION_TYPES:
        for direction in _DIRECTIONS:
            for key in _stat_keys(direction, tran_type):
                stat_dict[key] = 0
    return stat_dict


def get_stat_df(df):
    """Build a table with one row of transaction statistics per user.

    Users are the sorted union of the values in ``"User ID (sender)"`` and
    ``"User ID (receiver)"``. The array of users is printed as a side effect.

    Args:
        df: DataFrame with the columns ``"User ID (sender)"``,
            ``"User ID (receiver)"``, ``"Type of transaction"``,
            ``"Amount of transaction"``, ``"Sender type"`` and
            ``"Receiver type"``.

    Returns:
        DataFrame indexed 0..n-1 with the columns produced by
        ``init_stat_dict`` plus ``"User ID"``, ``"User type"``,
        ``"Unique_receivers"`` and ``"Unique_senders"``. Statistics that do
        not exist for a user (e.g. min of no transactions) are filled with 0.
    """
    sent_unique_users = df["User ID (sender)"].unique()
    received_unique_users = df["User ID (receiver)"].unique()
    unique_users = np.unique(
        np.concatenate((sent_unique_users, received_unique_users), 0)
    )
    print(unique_users)

    rows = []
    for user in unique_users:
        stat_dict = init_stat_dict()
        stat_dict["User ID"] = user

        sent_df = df.loc[df["User ID (sender)"] == user]
        received_df = df.loc[df["User ID (receiver)"] == user]

        if not sent_df.empty:
            stat_dict["Unique_receivers"] = len(sent_df["User ID (receiver)"].unique())
            stat_dict["User type"] = sent_df["Sender type"].unique()[0]
            _fill_direction_stats(stat_dict, sent_df, "Sent")
        else:
            # The user never sent anything, so they appear only as a receiver
            # and the type is taken from the receiving side.
            stat_dict["User type"] = received_df["Receiver type"].unique()[0]

        if not received_df.empty:
            stat_dict["Unique_senders"] = len(received_df["User ID (sender)"].unique())
            _fill_direction_stats(stat_dict, received_df, "Received")

        rows.append(stat_dict)

    # Build the frame once: concatenating inside the loop is quadratic and
    # leaves every row with the same index label 0.
    stat_df = pd.DataFrame(rows)
    stat_df = stat_df.fillna(0)
    return stat_df
Кстати, обратите внимание: уникальных пользователей в системе 2009. Это больше, чем число уникальных отправителей (1861) и число уникальных получателей (1562) по отдельности, значит, какие-то пользователи только отправляют деньги, а какие-то только получают.
# Build the per-user statistics table and print its (rows, columns) shape.
stat_df = get_stat_df(df)
print(stat_df.shape)
# print(stat_df.head())
['PN_EU_0_0' 'PN_EU_0_1' 'PN_EU_0_10' ... 'PN_Ret5' 'PN_Ret6' 'operator'] (2009, 54) Sent_amount_Wl Sent_amount_Wl_median Sent_amount_Wl_min 0 0.0 0.0 0.0 \ 0 0.0 0.0 0.0 0 0.0 0.0 0.0 0 0.0 0.0 0.0 0 0.0 0.0 0.0 Sent_amount_Wl_max Sent_Wl_count Received_amount_Wl 0 0.0 0 0.0 \ 0 0.0 0 0.0 0 0.0 0 0.0 0 0.0 0 0.0 0 0.0 0 0.0 Received_amount_Wl_median Received_amount_Wl_min Received_amount_Wl_max 0 0.0 0.0 0.0 \ 0 0.0 0.0 0.0 0 0.0 0.0 0.0 0 0.0 0.0 0.0 0 0.0 0.0 0.0 Received_Wl_count ... Sent_Dt_count Received_amount_Dt 0 0 ... 0 686643.36 \ 0 0 ... 0 483467.30 0 0 ... 0 0.00 0 0 ... 0 0.00 0 0 ... 0 0.00 Received_amount_Dt_median Received_amount_Dt_min Received_amount_Dt_max 0 27845.615 15965.17 41729.94 \ 0 35925.855 8067.95 86422.48 0 0.000 0.00 0.00 0 0.000 0.00 0.00 0 0.000 0.00 0.00 Received_Dt_count User ID User type Unique_senders 0 24 PN_EU_0_0 EU 2.0 \ 0 12 PN_EU_0_1 EU 6.0 0 0 PN_EU_0_10 EU 2.0 0 0 PN_EU_0_100 EU 1.0 0 0 PN_EU_0_1000 EU 0.0 Unique_receivers 0 0.0 0 0.0 0 2.0 0 1.0 0 1.0 [5 rows x 54 columns]
Была выбрана часть статистик, и по ним построены проекции пользователей. Анализируемые поля были выбраны на основе анализа свойств возможных финансовых аномалий (т.е. просто эвристически:)).
from pandas.plotting import scatter_matrix from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import LabelEncoder from sklearn.decomposition import PCA from matplotlib.ticker import FormatStrFormatter import plotly.express as px
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) Cell In[47], line 2 1 from pandas.plotting import scatter_matrix ----> 2 from sklearn.preprocessing import StandardScaler 3 from sklearn.preprocessing import LabelEncoder 4 from sklearn.decomposition import PCA ModuleNotFoundError: No module named 'sklearn'
Мошенничество, связанное с заражением бот-сетью.¶
Согласно описанию сценария атаки, есть множество зараженных пользователей, которые переводят деньги какому-то пользователю ("ослу" или "мулу"), и уже он выполняет операции обналичивания денег. Рассмотрен простейший вариант сценария: цепочка мулов состоит из одного звена.
# Keep the fields related to transfers and withdrawals, plus the numbers of
# unique counterparties - this is a botnet scenario, after all.
# 'Unique_receivers' used to be listed twice, which fed PCA a duplicated
# feature and left out the number of unique senders.
MobileBot_labels = ['Unique_senders', 'Unique_receivers', 'Sent_Ind_count', 'Sent_Wl_count', 'Received_Ind_count']

# These fields will be used to look for users affected by phone theft.
MobileTheft_labels = ['Sent_amount_Wl', 'Sent_amount_Wl_median', 'Sent_amount_Wl_min', 'Sent_amount_Wl_max', 'Sent_Wl_count']

x = stat_df[MobileBot_labels].values

# Standardize the values before PCA.
x = StandardScaler().fit_transform(x)

pca = PCA(n_components=3)
principalComponents = pca.fit_transform(x)
print(f'Explained variance: {pca.explained_variance_ratio_}\tSum: {pca.explained_variance_ratio_.sum()}')
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[48], line 11 8 x = stat_df[MobileBot_labels].values 10 # нормализуем значения ---> 11 x = StandardScaler().fit_transform(x) 13 pca = PCA(n_components=3) 14 principalComponents = pca.fit_transform(x) NameError: name 'StandardScaler' is not defined