Iris classifier project
In this project, we will learn:
- How to use the data catalog
- How to read from and write to the data catalog
- The basics of experiment tracking
Project setup
$ villard create iris-classifier
$ cd iris-classifier
Configuration
Villard supports YAML, JSON, and Jsonnet configuration files. In this example, we use Jsonnet to define the configuration because of its flexibility (see https://jsonnet.org for more details).
Data catalog
Hard coding the data paths is not a good idea. Villard provides a data catalog to help you manage your data and to make the tracking of data easier. It defines everything related to data input and output, as well as its meta-data, inside the configuration file.
We will start using the data catalog in this example. At minimum, you need to define `path` and `type` for each entry in the data catalog. For all supported data types, refer to the data types documentation. Data placement follows the cookiecutter recommendations.
/**
Step params
*/
// Parameters for the `preprocess_data` step.
// The 'data::' prefix appears to point at a data catalog entry
// (here `local_iris`) -- confirm against the Villard documentation.
local preprocess_data_params = {
  data: 'data::local_iris',
};
// Parameters for the `split_data` step.
// The 'ref::' prefix presumably refers to the output of another step
// (here `preprocess_data`) -- confirm against the Villard documentation.
local split_data_params = {
  data: 'ref::preprocess_data',
  train_frac: 0.8,
};
// Parameters for the `train_model` step.
// 'obj::SVC' presumably resolves to an object registered under the name
// "SVC" via `pipeline.register_object` -- confirm against the Villard docs.
local train_model_params = {
  data: 'ref::split_data',
  model_class: 'obj::SVC',
  // Keyword arguments for the model class.
  model_params: {
    kernel: 'rbf',
    C: 10,
  },
};
// Default parameters for the `make_inference` step. Pipelines may override
// individual fields (e.g. `make_inference_params { stdout: true }`).
local make_inference_params = {
  // by default, the model is loaded from a file
  // defined in data catalog
  model: 'data::trained_model',
  feature_df: 'data::test_features',
  stdout: false,
};
// Parameters for the `evaluate_model` step: predictions come from the
// `make_inference` step, ground truth from the `test_target` data entry.
local evaluate_model_params = {
  predicted_target_df: 'ref::make_inference',
  actual_target_df: 'data::test_target',
};
/**
Main configuration entry point
*/
{
  // Data catalog: one entry per named dataset/artifact, each with at least
  // a `path` and a `type`.
  // NOTE(review): the step parameters and step code also reference
  // `test_features` and `test_target`, which are not defined here --
  // confirm whether catalog entries for them are required.
  data_catalog: {
    local_iris: {
      path: 'data/01_raw/iris.csv',
      type: 'DT_PANDAS_DATAFRAME',
      write_params: {
        index: false,
      },
    },
    trained_model: {
      path: 'data/03_output/model.pkl',
      type: 'DT_PICKLE',
      track_on_write: true,
    },
  },

  // Python modules that contain the `@pipeline.step` implementations.
  step_implementation_modules: ['steps.data_engineering', 'steps.data_science'],
  experiment_output_dir: 'experiment_output',

  // Each pipeline maps step names to their parameter objects.
  pipeline_definition: {
    // `+` merges the two pipeline objects into a single one.
    _default: self.data_engineering_pipeline + self.training_pipeline,

    data_engineering_pipeline: {
      preprocess_data: preprocess_data_params,
      split_data: split_data_params,
    },

    training_pipeline: {
      train_model: train_model_params,
      // Inference using freshly trained model instead of the loaded model.
      make_inference: make_inference_params { model: 'ref::train_model' },
      evaluate_model: evaluate_model_params,
    },

    inference_pipeline: {
      // Inference using loaded model (default action)
      make_inference: make_inference_params { stdout: true },
    },
  },
}
The data engineering code (steps/data_engineering.py)
from typing import Tuple
import pandas as pd
from villard import pipeline
from sklearn.model_selection import train_test_split
@pipeline.step("preprocess_data")
def load_data(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split the raw dataset into features and target.

    Every column except the last is treated as a feature; the last column
    is treated as the target.

    Args:
        data: The raw dataset, one row per sample.

    Returns:
        A ``(X, y)`` tuple of features and target. ``y`` is selected with a
        scalar column index, so pandas returns it as a one-dimensional
        ``Series`` rather than a ``DataFrame``.
    """
    df = data
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    # NOTE(review): this records the feature frame under the key
    # "trained_model", which `train_model` also uses for the model class
    # name -- looks like a copy-paste slip; confirm the intended key.
    pipeline.track("trained_model", X)
    return X, y
@pipeline.step("split_data")
def split_data(
    data: Tuple[pd.DataFrame, pd.DataFrame], train_frac: float
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split features and target into train and test partitions.

    The test partitions are also written out under the names
    "test_features" and "test_target" so that a later inference or
    evaluation run can read them back.

    Args:
        data: A ``(X, y)`` tuple of features and target.
        train_frac: Passed to ``train_test_split`` as ``train_size``.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    X, y = data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_frac
    )
    pipeline.write_data("test_features", X_test)
    pipeline.write_data("test_target", y_test)
    return X_train, X_test, y_train, y_test
The data science code (steps/data_science.py)
from typing import Any, Dict, Tuple
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from villard import pipeline
from sklearn import svm, tree, linear_model

# Register the model classes under string names; the configuration selects
# one with e.g. `model_class: 'obj::SVC'`.
pipeline.register_object("SVC", svm.SVC)
pipeline.register_object("DecisionTreeClassifier", tree.DecisionTreeClassifier)
pipeline.register_object("LogisticRegression", linear_model.LogisticRegression)
@pipeline.step("train_model")
def train_model(
    data: Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame],
    model_class: Any,
    model_params: Dict[str, Any],
) -> Any:
    """Fit a model on the training partition and record it.

    Args:
        data: ``(X_train, X_test, y_train, y_test)``; only the training
            partitions are used here.
        model_class: Callable that builds an estimator when invoked with
            ``**model_params``; the result must provide ``fit``.
        model_params: Keyword arguments for ``model_class``. Each entry is
            also tracked individually.

    Returns:
        The fitted estimator (whatever ``fit`` returns).
    """
    X_train, _, y_train, _ = data
    clf = model_class(**model_params).fit(X_train, y_train)

    # Persist the fitted model and record which class produced it.
    pipeline.write_data("trained_model", clf)
    pipeline.track("trained_model", clf.__class__.__name__)

    # Track every hyper-parameter under its own name.
    for k, v in model_params.items():
        pipeline.track(k, v)
    return clf
@pipeline.step("make_inference")
def make_inference(model: Any, feature_df: pd.DataFrame, stdout: bool) -> pd.DataFrame:
    """Predict targets for the given features.

    Args:
        model: A fitted estimator exposing ``predict``.
        feature_df: Feature rows to predict on.
        stdout: When true, the predictions are also printed.

    Returns:
        Whatever ``model.predict`` returns for ``feature_df``.
    """
    pred = model.predict(feature_df)
    if stdout:
        print(pred)
    return pred
@pipeline.step("evaluate_model")
def evaluate_model(
    actual_target_df: pd.DataFrame, predicted_target_df: pd.DataFrame
) -> Any:
    """Compute, print, and track the accuracy of the predictions.

    Args:
        actual_target_df: Ground-truth targets.
        predicted_target_df: Predicted targets, aligned with the ground truth.

    Returns:
        The value returned by ``accuracy_score``.
    """
    accuracy = accuracy_score(actual_target_df, predicted_target_df)
    print("Accuracy: ", accuracy)
    pipeline.track("test_accuracy", accuracy)
    return accuracy