Support Vector Machine

この記事では、機械学習における非線形分類手法の一つ、サポートベクターマシン(SVM)を紹介します。パラメータの調整さえ正しく行えば、手軽に精度が出せるので本研究室でも好んで使用されてきました。

本記事の対象者：化学構造データに対してSVMを適用して識別モデルを構築したい方。

本記事は以下のように構成されています。

SVMとは
SVM実践＠ケモインフォマティクス
まとめ

サポートベクターマシンとは

識別モデルを構築するための統計手法です。もともとは、線形のクラス分類法として提案されたようですが、非線形分類問題にも対応できます。アルゴリズムの根底にあるのはマージン最大化という考え方です。が、説明はここでは省略させていだきます。

SVM実践＠ケモインフォマティクス

今回は、ソフトマージンのSVM（カーネル：rbf）を構造活性ー相関データに対して適用して、テスト化合物に対する活性の有無を判定してみようと思います。

尚、解析中に使用しているクラスはメインのスクリプトの後に記載します。

import copy
import collections
import pandas as pd
import numpy as np

from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

from sklearn import model_selection
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.svm import SVC

1. データの読み込み

予め重複を取り除いてある活性有化合物と活性無化合物を読み込みます。

active_mols  = [ mol for mol in Chem.SDMolSupplier("data/active_mols.sdf")]
inactive_mols = [ mol for mol in Chem.SDMolSupplier("data/inactive_mols.sdf")]

active_label   = np.ones(len(active_mols)) # 活性有化合物のラベル→１
inactive_label = np.zeros(len(inactive_mols)) # 活性無化合物のラベル→０

mols  = active_mols + inactive_mols
label = np.concatenate([active_label, inactive_label])

２．記述子の計算

今回はrdkit上で計算可能な記述子を使用します。

rdkit_desc_calculator = RdkitDescriptorCalculator()
descriptors   = rdkit_desc_calculator.calc_descripters(mols)

３．前処理

訓練データと検証用データを分けた後、スケーリングを行います。

X_train, X_test, y_train, y_test = model_selection.train_test_split(descriptors, label, test_size=0.25, random_state=0)
preprocessor = Preprocessor()
X_train_scaled = preprocessor.preprocess_train(X_train, scale_flag=True)
X_test_scaled  = preprocessor.preprocess_test(X_test, scale_flag=True)

４．SVMモデルの構築

いよいよモデルを構築します。グリッドサーチによりハイパーパラメータの調整を行います。

tuned_parameters = {'kernel': ['rbf'], 'gamma': [2 ** i for i in range(-15, 15, 2)], 'C': [2 ** i for i in range(-15, 15, 2)]}
classifier = model_selection.GridSearchCV(SVC(), tuned_parameters, cv=5, scoring="accuracy")

# モデルの構築
classifier.fit(X_train_scaled, y_train)
# モデル情報の確認
print(classifier.best_estimator_)

SVC(C=0.5, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001953125, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

# テストデータの活性を予測
y_pred = classifier.predict(X_test_scaled)

# 予測結果の確認
confusion_matrix(y_test, y_pred)

array([[154,   0],
       [  0, 135]])

accuracy_score(y_test, y_pred)

1.0 # 今回は完全な予測ができました。

rdkit 記述子計算用クラス

今回使用したクラスを紹介していきます。

class RdkitDescriptorCalculator:
    descriptors_names = [x[0] for x in Descriptors._descList]

    def __init__(self, descriptors_names=descriptors_names):
        self.descriptors_names = descriptors_names

    def calc_descripters(self, mols):
        descriptor_calc =　MoleculeDescriptors.MolecularDescriptorCalculator(self.descriptors_names)
        descriptors = [descriptor_calc.CalcDescriptors(mol) for mol in mols]
        df_descriptors = pd.DataFrame(descriptors)
        df_descriptors.columns = self.descriptors_names

        return df_descriptors

前処理用のクラス

class Preprocessor:
    def __init__(self, thr_nan=0.3, thr_same=0.8, thr_corr=0.8):
        self.thr_nan = thr_nan
        self.thr_same = thr_same
        self.thr_corr = thr_corr

    def preprocess_train(self, train_df, scale_flag=True):

        self.train_df = train_df

        # remove the rows with Nan by threshold.
        if self.thr_nan != 0:
            train_df = self.__remove_row_nan(self.thr_nan, train_df)

        # remove the columns containing Nan.
        train_df = train_df.dropna(axis=1)

        # remove the columns in which most values are the same by threshold.
        if self.thr_same != 0:
            train_df = self.__remove_col_same_value(self.thr_same, train_df)

        # remove the columns with high correlation coefficient.
        if self.thr_corr != 0:
            train_df = self.__remove_col_corr(self.thr_corr, train_df)

        # make the list about remaining columns.
        self.remaining_col_list = np.array(train_df.columns)

        # scaling the data.
        if scale_flag:
            train_matrix = train_df.as_matrix()
            scaler = preprocessing.StandardScaler().fit(train_matrix)
            self.scaler = scaler
            train_matrix_scaled = scaler.transform(train_matrix)
            train_df_scaled = pd.DataFrame(train_matrix_scaled)
            train_df_scaled.index = train_df.index
            train_df_scaled.columns = train_df.columns

        print("remaining variables: {}".format(train_df_scaled.shape[1]))

        return train_df_scaled

    def preprocess_test(self, test_df, scale_flag=True):

        # select the row left in preprocessed Train data.
        test_df = test_df[self.remaining_col_list]

        # dealing with Nan.
        for col in test_df.columns:
            test_df[col].fillna(self.train_df[col].mean(), inplace=True)

        # scaling the data.
        if scale_flag:
            test_matrix = test_df.as_matrix()
            test_matrix_scaled = self.scaler.transform(test_matrix)
            test_df_scaled = pd.DataFrame(test_matrix_scaled)
            test_df_scaled.index = test_df.index
            test_df_scaled.columns = test_df.columns

        return test_df_scaled

    def __remove_row_nan(self, thr_nan, df):
        """
        :param thr_nan:
        :param df:
        :return:
        """
        num_col = df.shape[1]
        num_row = df.shape[0]

        sample_index = np.arange(num_row)[::-1]

        for i in sample_index:
            num_nan = num_col - df.iloc[i, :].count()
            if num_nan * 1.0 / num_col >= thr_nan:
                df = df.drop(df.index[i])
        return df

    def __remove_col_same_value(self, thr_same, df):
        """
        :param thr_same:
        :param df:
        :return:
        """
        num_col = df.shape[1]
        num_row = df.shape[0]

        col_index = np.arange(num_col)[::-1]

        for i in col_index:
            col = df.iloc[:, i]
            num_most_frequent_value = collections.Counter(col).most_common(1)[0][1]
            if num_most_frequent_value / num_row >= thr_same:
                df = df.drop(df.columns[i], axis=1)
        return df

    def __remove_col_corr(self, thr_corr, df):

        matrix = df.as_matrix()
        corr_matrix = np.abs(np.corrcoef(matrix, rowvar=0))

        for i in np.arange(1, corr_matrix.shape[0])[::-1]:
            corr_list = corr_matrix[:i, i]
            if corr_list.max() >= thr_corr:
                df = df.drop(df.columns[i], axis=1)
        return df

まとめ

今回では、SVMの非常に簡単な説明と、rdkitやsklearnを用いた化合物の活性有無予測モデルの構築を行いました。

scikit-learnの公式ドキュメントを参照しながらいろいろと試してみてみると良いかと思います。皆さんのSVM理解の一助となれば幸いです。。

公式ドキュメント： sklearn.svm.SVC

Instruction of chemoinformatics

ケモインフォマティクスのオンライン入門書。