Anomaly Detection with Salesforce Merlion Package - Unsupervised learning with Isolation Forest, VAE, and E...

Anomaly Detection with Salesforce Merlion Package - Unsupervised learning with Isolation Forest, VAE, and Ensemble

Reference:

github: https://github.com/salesforce/Merlion

Steps

reference: example
Isolation Forest: sklearn
VAE: https://github.com/salesforce/Merlion/blob/main/merlion/models/anomaly/vae.py

download market data using yfinance: download S&P 500 ('^GSPC')
calculate the 20-day max return (i.e. the target in a supervised learning problem):
- for each date (T):
  - calculate the max price change in next 20 trading dates: price_change = (max{close price in T+1 to T+20} - {close price on T})/({close price on T})
use Merlion to do unsupervised anomaly detection
1. Initializing an anomaly detection model: isolation forest, vae, ensemble
2. Training the model
3. Producing a series of anomaly scores with the models
4. Visualizing the anomaly scores
takeaways
- the correlation table: correlation score between the target and the anomaly score from different learning algorithms (isolation forest, vae, and ensemble of isolation forest/vae):
  - VAE shows higher correlation score compared to isolation forest and ensemble in training data while much lower correlation in testing data.
  - This could be an indicator that VAE is overfitting the training data and shows weaker generalization capacity on the testing data (data unseen during the training phase).
- Visually inspecting the target versus the anomaly scores in training and testing data
  - plotly output
  - Visually, VAE seems to be doing better than isolation forest in training data but worse in testing data.

import numpy as np
import pandas as pd
import statsmodels.api as sm

from datetime import datetime, timedelta
import yfinance as yf #to download stock price data
  

import matplotlib.pyplot as plt

from merlion.plot import plot_anoms
from merlion.utils import TimeSeries
  

# Seed NumPy's global random generator so NumPy-based randomness repeats across runs.
RANDOM_SEED = 5678
np.random.seed(RANDOM_SEED)
  

download S&P 500 price data

# Request the longest available price history for the S&P 500 index ticker.
ticker = '^GSPC'
cur_data = yf.Ticker(ticker)
hist = cur_data.history(period="max")

# Quick sanity check: symbol, table dimensions, and earliest index value.
earliest_date = hist.index.min()
print(ticker, hist.shape, earliest_date)
  

^GSPC (19720, 7) 1927-12-30 00:00:00

# Keep only rows dated 2000-01-01 or later, as an independent copy of `hist`.
start_date = '2000-01-01'
on_or_after_start = hist.index >= start_date
df = hist.loc[on_or_after_start].copy()
df.head()
  

	Open	High	Low	Close	Volume	Dividends	Stock Splits
Date
2000-01-03	1469.250000	1478.000000	1438.359985	1455.219971	931800000	0	0
2000-01-04	1455.219971	1455.219971	1397.430054	1399.420044	1009000000	0	0
2000-01-05	1399.420044	1413.270020	1377.680054	1402.109985	1085500000	0	0
2000-01-06	1402.109985	1411.900024	1392.099976	1403.449951	1092300000	0	0
2000-01-07	1403.449951	1441.469971	1400.729980	1441.469971	1225200000	0	0

calculate max return in next 20 trading days

# Settings for the target: the price column to scan, the window length in
# trading days, and the name of the column that will hold the window maximum.
price_col = 'Close'
roll_len = 20
new_col = 'next_20day_max'
target_list = []  # not used in the visible cells

# Put the rows in chronological order (oldest first) before any windowing.
df.sort_index(inplace=True)
df.head(3)
  

	Open	High	Low	Close	Volume	Dividends	Stock Splits
Date
2000-01-03	1469.250000	1478.000000	1438.359985	1455.219971	931800000	0	0
2000-01-04	1455.219971	1455.219971	1397.430054	1399.420044	1009000000	0	0
2000-01-05	1399.420044	1413.270020	1377.680054	1402.109985	1085500000	0	0

# Forward-looking window: for each date T, the highest close over T+1 .. T+roll_len.
# `shift(1).rolling(roll_len)` on the chronological frame looks BACKWARDS
# (T-roll_len .. T-1), which is not the "next 20 trading days" target described
# in this notebook. Reversing the rows first makes "previous rows" the future
# dates; the result is flipped back to chronological order afterwards.
# NOTE(review): outputs recorded in this notebook were produced with the earlier
# trailing-window calculation and will differ once the cells are re-run.
close_reversed = df[[price_col]].iloc[::-1]
df_next20dmax = close_reversed.shift(1).rolling(roll_len).max().iloc[::-1]
df_next20dmax.columns = [new_col]
df = df.merge(df_next20dmax, right_index=True, left_index=True, how='inner')

# The last `roll_len` dates have no complete future window, so their maximum is
# NaN and they are removed here (together with any other row holding a NaN).
df.dropna(how='any', inplace=True)

# Percentage change from the close on T to the best close in the following window.
df['target'] = 100 * (df[new_col] - df[price_col]) / df[price_col]


df.head(3)
  

	Open	High	Low	Close	Volume	Dividends	Stock Splits	next_20day_max	target
Date
2000-02-01	1394.459961	1412.489990	1384.790039	1409.280029	981000000	0	0	1465.150024	3.964435
2000-02-02	1409.280029	1420.609985	1403.489990	1409.119995	1038600000	0	0	1465.150024	3.976243
2000-02-03	1409.119995	1425.780029	1398.520020	1424.969971	1146500000	0	0	1465.150024	2.819712

Merlion: Anomaly detection - unsupervised with Isolation Forest, VAE, and ensemble with default config

# Display the (rows, columns) dimensions of the prepared frame.
df.shape
  

(5478, 9)

# Chronological hold-out: the final 400 rows form the test set, everything
# before them the training set. Only the 'target' column is modelled.
target_frame = df[['target']]
train_ = target_frame.iloc[:-400].copy()
test_ = target_frame.iloc[-400:].copy()

# Wrap both pandas frames as Merlion TimeSeries objects for the detectors.
train_data, test_data = TimeSeries.from_pd(train_), TimeSeries.from_pd(test_)
  

# Import models & configs
from merlion.models.anomaly.isolation_forest import IsolationForest, IsolationForestConfig
from merlion.models.anomaly.vae import VAE, VAEConfig
from merlion.models.ensemble.anomaly import DetectorEnsemble, DetectorEnsembleConfig
from merlion.post_process.threshold import AggregateAlarms

# isolation forest: stand-alone detector with the default configuration
iso_forest_config = IsolationForestConfig()
iso_forest_model = IsolationForest(iso_forest_config)

# VAE: stand-alone detector with the default configuration
vae_config = VAEConfig()
vae_model = VAE(vae_config)

# ensemble: alarm threshold of 4 on the aggregated score.
# The ensemble is given its OWN detector instances. Passing `iso_forest_model`
# and `vae_model` here would presumably let `en_model.train()` re-fit those same
# objects (the recorded training log shows the VAE being fitted twice), so the
# stand-alone training scores and the later test scores could come from
# different fits. Separate instances keep the three models independent.
en_config = DetectorEnsembleConfig(threshold=AggregateAlarms(alm_threshold=4))
en_model = DetectorEnsemble(
    config=en_config,
    models=[IsolationForest(IsolationForestConfig()), VAE(VAEConfig())],
)
  

# Fit every detector on the training series without labels (unsupervised).
# The value returned by train() is kept as that model's training-set scores.
fit_kwargs = dict(train_data=train_data, anomaly_labels=None)
iso_forest_train_score = iso_forest_model.train(**fit_kwargs)
vae_train_score = vae_model.train(**fit_kwargs)
en_train_score = en_model.train(**fit_kwargs)
  

 |████████████████████████████████████████| 100.0% Complete, Loss 1.0673
 |████████████████████████████████████████| 100.0% Complete, Loss 1.1290
  

Model Inference
- model.get_anomaly_score() returns the model's raw anomaly scores,
- model.get_anomaly_label() returns the model's post-processed anomaly scores. The post-processing calibrates the anomaly scores to be interpretable as z-scores, and it also sparsifies them such that any nonzero values should be treated as an alert that a particular timestamp is anomalous.

# Line up the training target with each model's training scores on the shared
# date index. Inner joins keep only the dates present in every series.
df_train_scores = train_
for score_series in (iso_forest_train_score, vae_train_score, en_train_score):
    df_train_scores = df_train_scores.merge(
        score_series.to_pd(), left_index=True, right_index=True, how='inner'
    )

# Compare the merged shape with the raw training shape to spot dropped dates.
print(df_train_scores.shape, train_.shape)
df_train_scores.head(2)
  

(5077, 4) (5078, 1)

	target	anom_score_x	anom_score_y	anom_score
2000-02-02	3.976243	0.333851	0.348187	0.220717
2000-02-03	2.819712	0.356733	0.063608	0.366427

# Score the held-out series with every model. For each model the raw scores are
# requested first and the post-processed labels second, in the order
# isolation forest -> VAE -> ensemble.

# isolation forest
if_test_scores = iso_forest_model.get_anomaly_score(test_data)
if_test_labels = iso_forest_model.get_anomaly_label(test_data)

# VAE
vae_test_scores = vae_model.get_anomaly_score(test_data)
vae_test_labels = vae_model.get_anomaly_label(test_data)

# ensemble
en_test_scores = en_model.get_anomaly_score(test_data)
en_test_labels = en_model.get_anomaly_label(test_data)

# pandas views of every result, used for merging and inspection
if_test_scores_df, if_test_labels_df = if_test_scores.to_pd(), if_test_labels.to_pd()
vae_test_scores_df, vae_test_labels_df = vae_test_scores.to_pd(), vae_test_labels.to_pd()
en_test_scores_df, en_test_labels_df = en_test_scores.to_pd(), en_test_labels.to_pd()
  

# Join the test target with every score/label series on the shared date index.
#
# Each single-column Merlion frame is renamed to its final column name BEFORE it
# is merged. Chaining six merges of frames that all carry the same column name
# makes pandas' automatic `_x`/`_y` suffixes collide (a second `anom_score_x`
# appears on the fourth merge), which current pandas rejects.
test_frames = [
    ('iso_forest_score', if_test_scores_df),
    ('vae_score', vae_test_scores_df),
    ('ensemble_score', en_test_scores_df),
    ('iso_forest_label', if_test_labels_df),
    ('vae_label', vae_test_labels_df),
    ('ensemble_label', en_test_labels_df),
]

df_test_scores = test_
for final_name, frame in test_frames:
    renamed = frame.copy()           # leave the original *_df frames untouched
    renamed.columns = [final_name]   # each frame is expected to hold one column
    df_test_scores = df_test_scores.merge(
        renamed, left_index=True, right_index=True, how='inner'
    )

# Compare the raw test shape with the merged shape to spot dropped dates.
print(test_.shape, df_test_scores.shape)
  

(400, 1) (399, 7)

# Give the merged columns descriptive names: the target, then one score column
# per model, and (test frame only) one label column per model.
score_columns = ['target', 'iso_forest_score', 'vae_score', 'ensemble_score']
label_columns = ['iso_forest_label', 'vae_label', 'ensemble_label']

df_train_scores.columns = score_columns
df_test_scores.columns = score_columns + label_columns


df_test_scores.head(3)
  

	target	iso_forest_score	vae_score	ensemble_score
2020-04-09	-1.428052	0.387641	0.942809	1.400845
2020-04-13	1.020781	0.430124	0.355629	1.021738
2020-04-14	-1.976065	0.480604	1.075184	1.832749

# Pairwise correlation between the target and each model's training-set scores.
df_train_scores.corr()
  

	target	iso_forest_score	vae_score	ensemble_score
target	1.000000	0.661876	0.828453	0.710455
iso_forest_score	0.661876	1.000000	0.606901	0.835899
vae_score	0.828453	0.606901	1.000000	0.801360
ensemble_score	0.710455	0.835899	0.801360	1.000000

# Pairwise correlation on the test set. The label columns come out as NaN in the
# recorded output; the value counts further down show them to be all 0.0.
df_test_scores.corr()
  

	target	iso_forest_score	vae_score	ensemble_score	iso_forest_label	vae_label	ensemble_label
target	1.000000	0.433776	0.083479	0.294011	NaN	NaN	NaN
iso_forest_score	0.433776	1.000000	0.027284	0.688255	NaN	NaN	NaN
vae_score	0.083479	0.027284	1.000000	0.686652	NaN	NaN	NaN
ensemble_score	0.294011	0.688255	0.686652	1.000000	NaN	NaN	NaN
iso_forest_label	NaN	NaN	NaN	NaN	NaN	NaN	NaN
vae_label	NaN	NaN	NaN	NaN	NaN	NaN	NaN
ensemble_label	NaN	NaN	NaN	NaN	NaN	NaN	NaN

# Frequency of each isolation-forest label value on the test set.
df_test_scores['iso_forest_label'].value_counts()
  

0.0    399
Name: iso_forest_label, dtype: int64
  

# Frequency of each VAE label value on the test set.
df_test_scores['vae_label'].value_counts()
  

0.0    399
Name: vae_label, dtype: int64
  

Visualizing the results

generate graphs using plotly, display graphs inline and export graphs to a HTML file.

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
  

# Collects every figure built below so they can be exported together.
fig_list = []


# Training-data chart: the target is drawn against the primary y-axis and the
# two detector scores against the secondary y-axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])

# (column to plot, whether it goes on the secondary y-axis), in drawing order
train_traces = (
    ("target", False),
    ("iso_forest_score", True),
    ("vae_score", True),
)
for trace_name, on_secondary_axis in train_traces:
    line_trace = go.Scatter(
        name=trace_name,
        mode="lines",
        x=df_train_scores.index,
        y=df_train_scores[trace_name],
    )
    fig.add_trace(line_trace, secondary_y=on_secondary_axis)

fig.update_layout(
    hovermode="x unified",
    title_text="Merlion Anomaly Detection with Isolation Forest and VAE - training data",
)

# y-axis titles
fig.update_yaxes(title_text="<b>target</b>", secondary_y=False)
fig.update_yaxes(title_text="<b>anomaly score: </b> isolation forest and vae", secondary_y=True)

# x-axis: range slider plus quick-zoom buttons
range_buttons = [
    dict(count=1, label="1m", step="month", stepmode="backward"),
    dict(count=6, label="6m", step="month", stepmode="backward"),
    dict(count=1, label="YTD", step="year", stepmode="todate"),
    dict(count=1, label="1y", step="year", stepmode="backward"),
    dict(step="all"),
]
fig.update_xaxes(
    title_text="date",
    rangeslider_visible=True,
    rangeselector=dict(buttons=range_buttons),
)
fig.show()

fig_list.append(fig)
  

# Testing-data chart: the target is drawn against the primary y-axis and the
# two detector scores against the secondary y-axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])

# (column to plot, whether it goes on the secondary y-axis), in drawing order
test_traces = (
    ("target", False),
    ("iso_forest_score", True),
    ("vae_score", True),
)
for trace_name, on_secondary_axis in test_traces:
    line_trace = go.Scatter(
        name=trace_name,
        mode="lines",
        x=df_test_scores.index,
        y=df_test_scores[trace_name],
    )
    fig.add_trace(line_trace, secondary_y=on_secondary_axis)

fig.update_layout(
    hovermode="x unified",
    title_text="Merlion Anomaly Detection with Isolation Forest and VAE - testing data",
)

# y-axis titles
fig.update_yaxes(title_text="<b>target</b>", secondary_y=False)
fig.update_yaxes(title_text="<b>anomaly score: </b> isolation forest and vae", secondary_y=True)

# x-axis: range slider plus quick-zoom buttons
range_buttons = [
    dict(count=1, label="1m", step="month", stepmode="backward"),
    dict(count=6, label="6m", step="month", stepmode="backward"),
    dict(count=1, label="YTD", step="year", stepmode="todate"),
    dict(count=1, label="1y", step="year", stepmode="backward"),
    dict(step="all"),
]
fig.update_xaxes(
    title_text="date",
    rangeslider_visible=True,
    rangeselector=dict(buttons=range_buttons),
)
fig.show()

fig_list.append(fig)
  

from pathlib import Path

# Export every collected figure into ONE well-formed HTML document.
fig_path = r'html/2_Merlion_Isoforest_VAE.html'

# The output folder may not exist yet; opening the file would fail without it.
Path(fig_path).parent.mkdir(parents=True, exist_ok=True)

# One <div> fragment per figure. plotly.js is embedded with the first fragment
# only; later fragments reuse the library already loaded by the page.
fig_divs = [
    fig_i.to_html(full_html=False, include_plotlyjs=(position == 0))
    for position, fig_i in enumerate(fig_list)
]

# All fragments go inside a single <body>, instead of being appended after the
# closing </html> of a complete document. The encoding is explicit so the file
# matches its declared charset on every platform.
with open(fig_path, 'w', encoding='utf-8') as f:
    f.write('<html>\n<head><meta charset="utf-8" /></head>\n<body>\n')
    f.write('\n'.join(fig_divs))
    f.write('\n</body>\n</html>\n')