Pipeline

Simple example of optimizing a scikit-learn pipeline: a grammar defines a space of candidate pipelines, and an RNN trained with policy gradient learns to generate pipelines that maximize cross-validation accuracy on the digits dataset.

import warnings

import pandas as pd
import numpy as np

from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.model_selection import cross_val_score

import torch

from grammaropt.grammar import build_grammar
from grammaropt.grammar import extract_rules_from_grammar
from grammaropt.grammar import as_str
from grammaropt.random import RandomWalker
from grammaropt.terminal_types import Int, Float
from grammaropt.rnn import RnnModel
from grammaropt.rnn import RnnAdapter
from grammaropt.rnn import RnnWalker

warnings.filterwarnings("ignore")

digits = datasets.load_digits()
dataset = digits


def evaluate(code):
    """Return the mean 5-fold cross-validation accuracy of the pipeline
    described by `code`, or 0. if the candidate fails to fit (e.g. because
    of an invalid hyper-parameter combination)."""
    X = dataset.data
    y = dataset.target
    clf = _build_estimator(code)
    try:
        scores = cross_val_score(clf, X, y, cv=5)
    except Exception:
        return 0.
    else:
        return float(np.mean(scores))


def _build_estimator(code):
    # `code` is a Python expression produced from the grammar; eval resolves
    # its names against the module-level scikit-learn imports.
    clf = eval(code)
    return clf
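
# Illustrative usage: `evaluate` returns a score in [0, 1], e.g.
#   evaluate("make_pipeline(PCA(n_components=8),LogisticRegression())")
# and returns 0. when fitting the candidate raises an exception.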


def main():
    # the grammar describes a space of simple scikit-learn pipelines
    rules = """
        pipeline = "make_pipeline" "(" elements "," estimator ")"
        elements = (element "," elements) / element
        element = pca / rf
        pca = "PCA" "(" "n_components" "=" int ")"
        estimator = rf / logistic
        logistic = "LogisticRegression" "(" ")"
        rf = "RandomForestClassifier" "(" "max_depth" "=" int "," "max_features" "=" float ")"
    """
    # build grammar
    types = {'int': Int(1, 10), 'float': Float(0., 1.)}
    grammar = build_grammar(rules, types=types)
    rules = extract_rules_from_grammar(grammar)
    tok_to_id = {r: i for i, r in enumerate(rules)}
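    # Each production rule of the grammar is a token: the RNN predicts rule
    # ids, so the rule set defines its vocabulary (`vocab_size` below).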

    # set hyper-parameters and build RNN model
    nb_iter = 100
    vocab_size = len(rules)
    emb_size = 128
    hidden_size = 128
    nb_features = 2
    lr = 1e-4
    gamma = 0.9

    model = RnnModel(
        vocab_size=vocab_size,
        emb_size=emb_size,
        hidden_size=hidden_size,
        nb_features=nb_features)

    optim = torch.optim.Adam(model.parameters(), lr=lr)
    rnn = RnnAdapter(model, tok_to_id)
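    # The adapter couples the torch model with the rule-to-id mapping so the
    # walker below can score and sample grammar decisions with it (see the
    # grammaropt docs for details).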

    # optimization loop
    acc_rnn = []
    R_avg, R_max = 0., 0.
    wl = RnnWalker(grammar=grammar, rnn=rnn)
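    # A grammar-only baseline could swap in the imported RandomWalker here,
    # e.g. `wl = RandomWalker(grammar=grammar)` (a sketch; check the
    # grammaropt API for the exact constructor arguments).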
    out = []
    for it in range(nb_iter):
        # walk the grammar to generate a parse tree
        wl.walk()
        # get the produced terminals from the walker
        # and convert them to Python code as a string
        code = as_str(wl.terminals)
        # evaluate the code with the Python interpreter;
        # the cross-validation accuracy is the reward
        R = evaluate(code)
        # exponential moving average of the reward, used as a baseline
        R_avg = R_avg * gamma + R * (1 - gamma)
        # update the model
        model.zero_grad()
        # The loss implements the REINFORCE policy gradient: generate a
        # candidate with the model, observe its reward R, then increase the
        # probability of the generated decisions proportionally to the
        # reward. `R_avg` is the baseline that reduces the variance of the
        # gradient.
        loss = (R - R_avg) * wl.compute_loss()
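        # Assuming `wl.compute_loss()` returns the negative log-likelihood
        # of the walker's decisions, minimizing this loss matches the
        # REINFORCE estimator -(R - b) * sum_t log p(a_t | a_<t)
        # with baseline b = R_avg.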
        loss.backward()
        optim.step()
        # track the best reward seen so far
        R_max = max(R, R_max)
        acc_rnn.append(R_max)
        out.append({"code": code, "R": R})
        print(code, R)
    # save all evaluated candidates to CSV, best first
    df = pd.DataFrame(out)
    df = df.sort_values(by="R", ascending=False)
    df.to_csv('pipeline.csv')

if __name__ == '__main__':
    main()
