import joblib
import numpy
from os.path import join
from sklearn.preprocessing import MultiLabelBinarizer
from languageflow.experiment import Experiment
from languageflow.transformer.count import CountVectorizer
from languageflow.transformer.tfidf import TfidfVectorizer
from languageflow.evaluation.validation import TrainTestSplitValidation
from languageflow.transformer.number import NumberRemover
class Flow:
    """
    Pipeline to build a model

    A flow holds a dataset, a list of candidate models, a set of scores and a
    validation method. ``train`` runs one ``Experiment`` per model and per
    learning-curve point; ``export`` writes the known transformers and one
    selected model to a folder.

    NOTE(review): no method of this class appends to ``self.transformers``;
    ``export`` only reads it, so callers have to populate that list themselves.

    Examples
    --------
    >>> from languageflow.flow import Flow
    >>> flow = Flow()
    >>> flow.data(X, y)
    >>> model = Model(SGD(), "SGD")
    >>> flow.add_model(model)
    >>> flow.train()
    """

    def __init__(self):
        # Dataset attributes exist from construction on; ``data`` fills them.
        self.X = None
        self.y = None
        self.sentences = None
        self.models = []
        # Multipliers applied to ``len(y)`` in ``train`` to get slice sizes;
        # the default single value 1 means "use the whole dataset once".
        self.lc_range = [1]
        self.result = []
        self.validation_method = TrainTestSplitValidation()
        self.scores = set()
        self.log_folder = "."
        # Not read by ``export``, which takes its target folder as a parameter.
        self.export_folder = "."
        self.transformers = []

    def data(self, X=None, y=None, sentences=None):
        """
        Add data to flow

        Parameters
        ----------
        X:
            samples; sliced with ``X[:n]`` in ``train``
        y:
            labels; must support ``len`` and ``y[:n]`` for ``train``
        sentences:
            stored as-is on the flow
        """
        self.X = X
        self.y = y
        self.sentences = sentences

    def add_model(self, model):
        """
        Add model to flow

        Parameters
        ----------
        model:
            object exposing ``estimator`` (used by ``train`` and ``export``)
            and ``name`` (used by ``export`` to select it)
        """
        self.models.append(model)

    def add_score(self, score):
        """
        Register a score; duplicates are ignored because scores form a set
        """
        self.scores.add(score)

    def set_learning_curve(self, start, stop, offset):
        """
        Define the learning-curve points as ``numpy.arange(start, stop, offset)``

        Each resulting value is multiplied by ``len(y)`` in ``train`` to get
        the number of leading samples to train on.
        """
        self.lc_range = numpy.arange(start, stop, offset)

    def set_validation(self, validation):
        """
        Replace the validation method handed to every ``Experiment``
        """
        self.validation_method = validation

    def train(self):
        """
        Train every registered model on each learning-curve slice of the data

        For each model and each value ``f`` in ``lc_range`` an ``Experiment``
        is created on the first ``int(f * len(y))`` samples, given the flow's
        scores, validation method and log folder, and trained.
        """
        if not self.models:
            # Nothing to train; return before touching the dataset, exactly as
            # an empty model loop would.
            return
        # The slice sizes depend only on the data, so compute them once
        # instead of once per model.
        sizes = [int(fraction * len(self.y)) for fraction in self.lc_range]
        for model in self.models:
            for size in sizes:
                experiment = Experiment(self.X[:size], self.y[:size],
                                        model.estimator, self.scores,
                                        self.validation_method)
                experiment.log_folder = self.log_folder
                experiment.train()

    def export(self, model_name, export_folder):
        """
        Export model and transformers to export_folder

        Parameters
        ----------
        model_name: string
            name of model to export
        export_folder: string
            folder to store exported model and transformers

        Raises
        ------
        IndexError
            if no registered model is named ``model_name``; raised before any
            file is written
        """
        # Resolve the model first so an unknown name cannot leave a folder
        # that contains transformers but no model.
        matches = [candidate for candidate in self.models
                   if candidate.name == model_name]
        if not matches:
            raise IndexError(
                "No model named {!r}; registered models: {}".format(
                    model_name,
                    [candidate.name for candidate in self.models]))
        model = matches[0]

        # Built here rather than at class level so the class can be defined
        # even when these types are resolved late. Order matters only for a
        # transformer matching several types: it is dumped once per match.
        transformer_files = (
            (MultiLabelBinarizer, "label.transformer.bin"),
            (TfidfVectorizer, "tfidf.transformer.bin"),
            (CountVectorizer, "count.transformer.bin"),
            (NumberRemover, "number.transformer.bin"),
        )
        for transformer in self.transformers:
            for transformer_type, filename in transformer_files:
                if isinstance(transformer, transformer_type):
                    joblib.dump(transformer,
                                join(export_folder, filename),
                                protocol=2)

        experiment = Experiment(self.X, self.y, model.estimator, None)
        model_filename = join(export_folder, "model.bin")
        experiment.export(model_filename)