Обнаружение злоумышленников в системе мобильных денежных переводов¶

Вариант 5

настройка окружения

In [4]:

# %pip install scipy==1.8.1
# %pip install networkx==2.7.0
# %pip install pyvis pandas numpy plotly

from functools import reduce
from pyvis import network as net

import pandas as pd
import numpy as np
import networkx as nx
import plotly.express as px
import plotly.graph_objects as go

from plotly.offline import iplot
from IPython.display import display, HTML




#for Jupiter notebooks
import plotly.io as pio #comment for Google collab
pio.renderers.default='notebook'#comment for Google collab

WARNING: Retrying (Retry(total=4, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x7f47347ab190>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/scipy/
WARNING: Retrying (Retry(total=3, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x7f47345c0610>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/scipy/
WARNING: Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x7f47345c0b50>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/scipy/
WARNING: Retrying (Retry(total=1, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x7f47345c14d0>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/scipy/
WARNING: Retrying (Retry(total=0, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NewConnectionError('<pip._vendor.urllib3.connection.HTTPSConnection object at 0x7f47345c1ed0>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution')': /simple/scipy/
ERROR: Could not find a version that satisfies the requirement scipy==1.8.1 (from versions: none)
ERROR: No matching distribution found for scipy==1.8.1

[notice] A new release of pip available: 22.3.1 -> 23.1.2
[notice] To update, run: pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.
Requirement already satisfied: networkx==2.7.0 in ./.venv/lib64/python3.11/site-packages (2.7)

[notice] A new release of pip available: 22.3.1 -> 23.1.2
[notice] To update, run: pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.
Requirement already satisfied: pyvis in ./.venv/lib64/python3.11/site-packages (0.3.2)
Requirement already satisfied: pandas in ./.venv/lib64/python3.11/site-packages (2.0.1)
Requirement already satisfied: numpy in ./.venv/lib64/python3.11/site-packages (1.24.3)
Requirement already satisfied: plotly in ./.venv/lib64/python3.11/site-packages (5.14.1)
Requirement already satisfied: ipython>=5.3.0 in ./.venv/lib64/python3.11/site-packages (from pyvis) (8.13.2)
Requirement already satisfied: jinja2>=2.9.6 in ./.venv/lib64/python3.11/site-packages (from pyvis) (3.1.2)
Requirement already satisfied: jsonpickle>=1.4.1 in ./.venv/lib64/python3.11/site-packages (from pyvis) (3.0.1)
Requirement already satisfied: networkx>=1.11 in ./.venv/lib64/python3.11/site-packages (from pyvis) (2.7)
Requirement already satisfied: python-dateutil>=2.8.2 in ./.venv/lib64/python3.11/site-packages (from pandas) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in ./.venv/lib64/python3.11/site-packages (from pandas) (2023.3)
Requirement already satisfied: tzdata>=2022.1 in ./.venv/lib64/python3.11/site-packages (from pandas) (2023.3)
Requirement already satisfied: tenacity>=6.2.0 in ./.venv/lib64/python3.11/site-packages (from plotly) (8.2.2)
Requirement already satisfied: packaging in ./.venv/lib64/python3.11/site-packages (from plotly) (23.1)
Requirement already satisfied: backcall in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (0.2.0)
Requirement already satisfied: decorator in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (5.1.1)
Requirement already satisfied: jedi>=0.16 in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (0.18.2)
Requirement already satisfied: matplotlib-inline in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (0.1.6)
Requirement already satisfied: pickleshare in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (0.7.5)
Requirement already satisfied: prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30 in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (3.0.38)
Requirement already satisfied: pygments>=2.4.0 in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (2.15.1)
Requirement already satisfied: stack-data in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (0.6.2)
Requirement already satisfied: traitlets>=5 in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (5.9.0)
Requirement already satisfied: pexpect>4.3 in ./.venv/lib64/python3.11/site-packages (from ipython>=5.3.0->pyvis) (4.8.0)
Requirement already satisfied: MarkupSafe>=2.0 in ./.venv/lib64/python3.11/site-packages (from jinja2>=2.9.6->pyvis) (2.1.2)
Requirement already satisfied: six>=1.5 in ./.venv/lib64/python3.11/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)
Requirement already satisfied: parso<0.9.0,>=0.8.0 in ./.venv/lib64/python3.11/site-packages (from jedi>=0.16->ipython>=5.3.0->pyvis) (0.8.3)
Requirement already satisfied: ptyprocess>=0.5 in ./.venv/lib64/python3.11/site-packages (from pexpect>4.3->ipython>=5.3.0->pyvis) (0.7.0)
Requirement already satisfied: wcwidth in ./.venv/lib64/python3.11/site-packages (from prompt-toolkit!=3.0.37,<3.1.0,>=3.0.30->ipython>=5.3.0->pyvis) (0.2.6)
Requirement already satisfied: executing>=1.2.0 in ./.venv/lib64/python3.11/site-packages (from stack-data->ipython>=5.3.0->pyvis) (1.2.0)
Requirement already satisfied: asttokens>=2.1.0 in ./.venv/lib64/python3.11/site-packages (from stack-data->ipython>=5.3.0->pyvis) (2.2.1)
Requirement already satisfied: pure-eval in ./.venv/lib64/python3.11/site-packages (from stack-data->ipython>=5.3.0->pyvis) (0.2.2)

[notice] A new release of pip available: 22.3.1 -> 23.1.2
[notice] To update, run: pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.

In [5]:

def pyvis_deepnote_show(nt):
    tmp_output_filename = tempfile.NamedTemporaryFile(suffix='.html').name
    nt.save_graph(tmp_output_filename)

    f = open(tmp_output_filename, "r")
    display(HTML(f.read()))

In [42]:

df = pd.read_csv('./FinFraud_unknown.csv', sep=',', parse_dates=[15, 16, 21])

df.columns = [
    'User ID (sender)', 
    'User ID (receiver)',
    'User account ID (sender)',
    'User account ID (receiver)',
    'Amount of transaction',
    'Type of transaction',
    'State of operation',
    'Balance before (sender)',
    'Balance after (sender)',
    'Balance after (receiver)',
    'Balance before (receiver)',    
    'Not used',
    'Not used',
    'Not used',
    'Not used',
    'Transaction timestamp (sender)',
    'Transaction timestamp (receiver)',
    'Sender account ID',
    'Not used',
    'Not used',
    'Not used',
    'Transaction timestamp',
    'Sender type',
    'Receiver type'
]
df = df.loc[:, ~df.columns.str.contains('^Not used', case=False)].sort_values('Transaction timestamp') 
df = df.drop('State of operation', axis=1)
df = df.drop('Sender account ID', axis=1)
df = df.drop('Transaction timestamp (sender)', axis=1)
df = df.drop('Transaction timestamp (receiver)', axis=1)
df = df.drop('Balance before (sender)', axis=1)
df = df.drop('Balance after (sender)', axis=1)
df = df.drop('Balance before (receiver)', axis=1)
df = df.drop('Balance after (receiver)', axis=1)

df["Amount of transaction"] = pd.to_numeric(df["Amount of transaction"], errors='coerce').fillna(0)


df.describe(include='all').fillna('')

Out[42]:

	User ID (sender)	User ID (receiver)	User account ID (sender)	User account ID (receiver)	Amount of transaction	Type of transaction	Transaction timestamp	Sender type	Receiver type
count	54030	54030	54030	54030	54030.0	54030	54030	54030	54030
unique	1861	1562	1861	1562		5	46394	2	4
top	PN_Ret4	operator	RAcc4	A0		ArRC	08.07.2011 15:16	EU	operator
freq	2256	27901	2256	27901		27901	5	41246	27901
mean					53083.47221
std					85834.97052
min					0.0
25%					2158.2525
50%					6257.375
75%					76821.9675
max					1053512.86

In [ ]:

Описание набора данных¶

Название столбца	Возможные значения	Описание
User ID (transaction sender)	Generated ID
User ID (transaction receiver)	Generated ID
User account ID (transaction sender)	Generated ID
User account ID (transaction receiver)	Generated ID
Amount of transaction	Number
Type of transaction	`Ind` `Dt` `ArRC` `Wl` `Merchant`	Тип транзакции `Ind` – денежный перевод между пользователями системы `Dt` – пополнение электронного кошелька (отправитель агент, а получатель - пользователь системы) `ArRC` – пополнение счета мобильной связи (перевод от пользователя системы к оператору мобильной связи ) `Wl` – снятие электронных денег (отправитель - пользователь системы, получатель - оператор) `Merchant` – перевод от пользователя поставщику услуг или товаров
State of operation	`SU`	`SU` – успешно
Balance before (transaction sender)	Number
Balance before (transaction receiver)	Number
Balance after (transaction sender)	Number
Balance after (transaction receiver)	Number
Transaction timestamp (sender)	Datetime
Transaction timestamp (receiver)	Datetime
Sender account ID	Generated ID
Transaction timestamp	Datetime
Sender type	`EU` `RET`
Receiver type	`EU` `operator` `RET` `MER`

Поскольку поле State of operation всегда имеет значение (SU) для всех транзакций, данный столбец предлагается удалить. Столбцы Sender account ID и User ID (transaction sender) идентичны, также столбцы Transaction timestamp (sender) и Transaction timestamp (receiver) идентичны стобцу Transaction timestamp, поэтому данные стобцы удалются (остается только Transaction timestamp). Также удаляюся столбцы с балансом, т.к. в текущей версии набора данных они не задействованы.

In [43]:

df.dtypes

Out[43]:

User ID (sender)               object
User ID (receiver)             object
User account ID (sender)       object
User account ID (receiver)     object
Amount of transaction         float64
Type of transaction            object
Transaction timestamp          object
Sender type                    object
Receiver type                  object
dtype: object

Статистика транзакций для каждого пользователя¶

Традиционно начнем со статистического анализа данных. Рекомендуется расширить число рассчитываемых статистик, например, включив показатели, характеризующие частоту транзакций. Для такого вида мошенничества как кража телефона изменение частоты снятий является характерным признаком.

In [44]:

def init_stat_dict():
    stat_dict = dict()
    transaction_types = {"Ind", "Wl", "Dt", "Merchant", "ArRC"} 
    for tran_type in transaction_types:
          amount_name = f"Sent_amount_{tran_type}"
          amount_median = f"Sent_amount_{tran_type}_median"
          amount_min = f"Sent_amount_{tran_type}_min"
          amount_max = f"Sent_amount_{tran_type}_max"
          tran_count = f"Sent_{tran_type}_count"
          rec_amount_name = f"Received_amount_{tran_type}"
          rec_amount_median = f"Received_amount_{tran_type}_median"
          rec_amount_min = f"Received_amount_{tran_type}_min"
          rec_amount_max = f"Received_amount_{tran_type}_max"
          rec_tran_count = f"Received_{tran_type}_count"
          
          stat_dict[amount_name] = 0
          stat_dict[amount_median] = 0
          stat_dict[amount_min] = 0
          stat_dict[amount_max] = 0
          stat_dict[tran_count] = 0
          stat_dict[rec_amount_name] = 0
          stat_dict[rec_amount_median] = 0
          stat_dict[rec_amount_min] = 0
          stat_dict[rec_amount_max] = 0
          stat_dict[rec_tran_count] = 0

    return stat_dict


def get_stat_df(df):
   sent_unique_users = df["User ID (sender)"].unique()
   received_unique_users = df["User ID (receiver)"].unique()
   unique_users = np.unique(np.concatenate((sent_unique_users,received_unique_users),0))
   print(unique_users)
   stat_df = pd.DataFrame()
   stat_dict = init_stat_dict()
   transaction_types = {"Ind", "Wl", "Dt", "Merchant", "ArRC"}
   for user in unique_users:
       stat_dict = init_stat_dict() 
       stat_dict["User ID"] = user

       user_df = df.loc[(df["User ID (sender)"] == user)]
       
       if (not user_df.empty):
          #stat_dict["User ID"] = user
          
          stat_dict["Unique_receivers"] = len(user_df["User ID (receiver)"].unique())
          stat_dict["User type"] = user_df["Sender type"].unique()[0]

          for tran_type in transaction_types:
              amount_name = f"Sent_amount_{tran_type}"
              amount_median = f"Sent_amount_{tran_type}_median"
              amount_min = f"Sent_amount_{tran_type}_min"
              amount_max = f"Sent_amount_{tran_type}_max"
              tran_count = f"Sent_{tran_type}_count"
              stat_dict[amount_name] = (user_df.loc[user_df["Type of transaction"]==tran_type])["Amount of transaction"].sum()
              stat_dict[amount_median] = (user_df.loc[user_df["Type of transaction"]==tran_type])["Amount of transaction"].mean()
              stat_dict[amount_min] = (user_df.loc[user_df["Type of transaction"]==tran_type])["Amount of transaction"].min()
              stat_dict[amount_max] = (user_df.loc[user_df["Type of transaction"]==tran_type])["Amount of transaction"].max()
              stat_dict[tran_count] = (user_df.loc[user_df["Type of transaction"]==tran_type])["Amount of transaction"].count()
       else:
          stat_dict["User type"] = (df.loc[(df["User ID (receiver)"]==user)])["Receiver type"].unique()[0]

       user_df = df.loc[(df["User ID (receiver)"] == user)]
       if (not user_df.empty):
          stat_dict["Unique_senders"] = len(user_df["User ID (sender)"].unique())
          for tran_type in transaction_types:
              rec_amount_name = f"Received_amount_{tran_type}"
              rec_amount_median = f"Received_amount_{tran_type}_median"
              rec_amount_min = f"Received_amount_{tran_type}_min"
              rec_amount_max = f"Received_amount_{tran_type}_max"
              rec_tran_count = f"Received_{tran_type}_count"
              stat_dict[rec_amount_name] = (user_df.loc[user_df["Type of transaction"]==tran_type])["Amount of transaction"].sum()
              stat_dict[rec_amount_median] = (user_df.loc[user_df["Type of transaction"]==tran_type])["Amount of transaction"].median()
              stat_dict[rec_amount_min] = (user_df.loc[user_df["Type of transaction"]==tran_type])["Amount of transaction"].min()
              stat_dict[rec_amount_max] = (user_df.loc[user_df["Type of transaction"]==tran_type])["Amount of transaction"].max()
              stat_dict[rec_tran_count] = (user_df.loc[user_df["Type of transaction"]==tran_type])["Amount of transaction"].count()
       
       df_temp = pd.DataFrame([stat_dict])
       
       #df_temp.head()
       stat_df = pd.concat([stat_df, df_temp])
   stat_df = stat_df.fillna(0)
   return stat_df

Кстати, обратите внимание уникальных пользователей в системе 2009. Это больше, чем число уникальных отправителей и уникальных получателей, значит, какие то пользователи только отправляют деньги, а какие-то только получают.

In [46]:

stat_df = get_stat_df(df)
print(stat_df.shape)
# print(stat_df.head())

['PN_EU_0_0' 'PN_EU_0_1' 'PN_EU_0_10' ... 'PN_Ret5' 'PN_Ret6' 'operator']
(2009, 54)
   Sent_amount_Wl  Sent_amount_Wl_median  Sent_amount_Wl_min   
0             0.0                    0.0                 0.0  \
0             0.0                    0.0                 0.0   
0             0.0                    0.0                 0.0   
0             0.0                    0.0                 0.0   
0             0.0                    0.0                 0.0   

   Sent_amount_Wl_max  Sent_Wl_count  Received_amount_Wl   
0                 0.0              0                 0.0  \
0                 0.0              0                 0.0   
0                 0.0              0                 0.0   
0                 0.0              0                 0.0   
0                 0.0              0                 0.0   

   Received_amount_Wl_median  Received_amount_Wl_min  Received_amount_Wl_max   
0                        0.0                     0.0                     0.0  \
0                        0.0                     0.0                     0.0   
0                        0.0                     0.0                     0.0   
0                        0.0                     0.0                     0.0   
0                        0.0                     0.0                     0.0   

   Received_Wl_count  ...  Sent_Dt_count  Received_amount_Dt   
0                  0  ...              0           686643.36  \
0                  0  ...              0           483467.30   
0                  0  ...              0                0.00   
0                  0  ...              0                0.00   
0                  0  ...              0                0.00   

   Received_amount_Dt_median  Received_amount_Dt_min  Received_amount_Dt_max   
0                  27845.615                15965.17                41729.94  \
0                  35925.855                 8067.95                86422.48   
0                      0.000                    0.00                    0.00   
0                      0.000                    0.00                    0.00   
0                      0.000                    0.00                    0.00   

   Received_Dt_count       User ID  User type  Unique_senders   
0                 24     PN_EU_0_0         EU             2.0  \
0                 12     PN_EU_0_1         EU             6.0   
0                  0    PN_EU_0_10         EU             2.0   
0                  0   PN_EU_0_100         EU             1.0   
0                  0  PN_EU_0_1000         EU             0.0   

   Unique_receivers  
0               0.0  
0               0.0  
0               2.0  
0               1.0  
0               1.0  

[5 rows x 54 columns]

Была выбрана часть статистик и построила проекции пользователей. Анализируемые поля были выбраны на основе анализа свойств возможных финансовых аномалий (т.е. просто эвристически:)).

In [47]:

from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from matplotlib.ticker import FormatStrFormatter
import plotly.express as px

---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
Cell In[47], line 2
      1 from pandas.plotting import scatter_matrix
----> 2 from sklearn.preprocessing import StandardScaler
      3 from sklearn.preprocessing import LabelEncoder
      4 from sklearn.decomposition import PCA

ModuleNotFoundError: No module named 'sklearn'

Мошенничество, связанное с заражением бот-сетью.¶

Согласно описанию сценария атаки: есть множество зараженных пользователей, которые переводят деньги какому-то пользователю ("ослу" или "мулу"), и уже он выполняет операции обналичивания денег. Рассмотрен простейщий вариант сценария: цепочка мулов состоит из одного звена.

In [48]:

#оставляем поля, связанные с переводами и снятиями и добавили число уникальных пользователей, это же бот сеть.

MobileBot_labels = ['Unique_receivers','Unique_receivers','Sent_Ind_count' ,'Sent_Wl_count', 'Received_Ind_count']

# а по этим полям будем пробовать найти пользователей с кражей телефона.
MobileTheft_labels = ['Sent_amount_Wl', 'Sent_amount_Wl_median', 'Sent_amount_Wl_min', 'Sent_amount_Wl_max', 'Sent_Wl_count']

x = stat_df[MobileBot_labels].values

# нормализуем значения
x = StandardScaler().fit_transform(x)

pca = PCA(n_components=3)
principalComponents = pca.fit_transform(x)
print(f'Explained variance: {pca.explained_variance_ratio_}\tSum: {pca.explained_variance_ratio_.sum()}')

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[48], line 11
      8 x = stat_df[MobileBot_labels].values
     10 # нормализуем значения
---> 11 x = StandardScaler().fit_transform(x)
     13 pca = PCA(n_components=3)
     14 principalComponents = pca.fit_transform(x)

NameError: name 'StandardScaler' is not defined

49 KiB Raw Permalink Blame History Unescape Escape

Обнаружение злоумышленников в системе мобильных денежных переводов¶

Описание набора данных¶

Статистика транзакций для каждого пользователя¶

Мошенничество, связанное с заражением бот-сетью.¶

49 KiB

Raw Permalink Blame History