Iris classifier project

In this project, we will learn how to set up a Villard project, describe its inputs and outputs with the data catalog, define a multi-step pipeline in a Jsonnet configuration, and implement the pipeline steps in Python.

Project setup

Create a new project and move into its directory:

$ villard create iris-classifier
$ cd iris-classifier

Configuration

Villard supports YAML, JSON, and Jsonnet configuration files. In this example we use Jsonnet because its local variables and object composition keep a multi-step configuration readable; see https://jsonnet.org for details on the language.
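While iterating on the configuration, it can help to look at the fully resolved result. As a side note (not a Villard feature), the official Jsonnet Python bindings can evaluate the file to plain JSON; the sketch below assumes the bindings are installed (pip install jsonnet) and that the configuration lives in config.jsonnet — adjust the file name to whatever villard create generated for your project.

import json

import _jsonnet  # official Jsonnet Python bindings (pip install jsonnet)

# Evaluate the Jsonnet file to a JSON string, then parse it into a dict.
# "config.jsonnet" is an assumed file name; use your project's actual one.
resolved = json.loads(_jsonnet.evaluate_file("config.jsonnet"))

# Inspect the resolved pipeline definition to check that the locals and
# object composition expanded as expected.
print(json.dumps(resolved["pipeline_definition"], indent=2))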

Data catalog

Hard-coding data paths is not a good idea. Villard provides a data catalog to help you manage your data and make data tracking easier: everything related to data input and output, including its metadata, is defined inside the configuration file.

We will make use of the data catalog in this example. At a minimum, each catalog entry needs a path and a type; for all supported data types, refer to data types. Data placement follows the cookiecutter recommendations, e.g. raw inputs under data/01_raw/ and pipeline outputs under data/03_output/.
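The catalog below expects the raw data at data/01_raw/iris.csv. If you do not have the file yet, here is a minimal sketch of one way to produce it (it assumes scikit-learn is installed and is not part of the Villard pipeline itself):

from pathlib import Path

from sklearn.datasets import load_iris

# Load the iris dataset as a single DataFrame (four feature columns
# followed by the target column).
df = load_iris(as_frame=True).frame

# Write it to the location the data catalog expects.
out_path = Path("data/01_raw/iris.csv")
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)

With the raw data in place, the full configuration for this example looks like this: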

/**
 Step params
 */
local preprocess_data_params = {
  data: 'data::local_iris',
};

local split_data_params = {
  data: 'ref::preprocess_data',
  train_frac: 0.8,
};

local train_model_params = {
  data: 'ref::split_data',
  model_class: 'obj::SVC',
  model_params: {
    kernel: 'rbf',
    C: 10,
  },
};

local make_inference_params = {
  // by default, the model is loaded from a file
  // defined in data catalog
  model: 'data::trained_model',
  feature_df: 'data::test_features',
  stdout: false,
};

local evaluate_model_params = {
  predicted_target_df: 'ref::make_inference',
  actual_target_df: 'data::test_target',
};


/**
 Main configuration entry point
 */
{
  data_catalog: {
    local_iris: {
      path: 'data/01_raw/iris.csv',
      type: 'DT_PANDAS_DATAFRAME',
      write_params: {
        index: false,
      },
    },
    trained_model: {
      path: 'data/03_output/model.pkl',
      type: 'DT_PICKLE',
      track_on_write: true,
    },
    // Intermediate outputs written by the split_data step and read back by
    // make_inference and evaluate_model via 'data::' references.
    test_features: {
      path: 'data/02_intermediate/test_features.csv',
      type: 'DT_PANDAS_DATAFRAME',
    },
    test_target: {
      path: 'data/02_intermediate/test_target.csv',
      type: 'DT_PANDAS_DATAFRAME',
    },
  },

  step_implementation_modules: ['steps.data_engineering', 'steps.data_science'],

  experiment_output_dir: 'experiment_output',

  pipeline_definition: {
    _default: self.data_engineering_pipeline + self.training_pipeline,

    data_engineering_pipeline: {
      preprocess_data: preprocess_data_params,
      split_data: split_data_params,
    },

    training_pipeline: {
      train_model: train_model_params,
      // Inference using freshly trained model instead of the loaded model.
      make_inference: make_inference_params { model: 'ref::train_model' },
      evaluate_model: evaluate_model_params,
    },

    inference_pipeline: {
      // Inference using loaded model (default action)
      make_inference: make_inference_params { stdout: true },
    },
  },
}
Tip

You can also split the Jsonnet configuration into separate files and import them from the main file. For example, you can move the data_catalog entries into data_catalog.libsonnet:

{
    local_iris: {
        path: 'data/01_raw/iris.csv',
        type: 'DT_PANDAS_DATAFRAME',
    },
    trained_model: {
        path: 'data/03_output/model.pkl',
        type: 'DT_PICKLE',
    },
}

then import it in the main configuration file as follows:

{
    "data_catalog": import "data_catalog.libsonnet",
    ...
}

The data engineering code: steps/data_engineering.py

from typing import Tuple

import pandas as pd
from sklearn.model_selection import train_test_split

from villard import pipeline


@pipeline.step("preprocess_data")
def preprocess_data(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    # Split the raw frame into features (all columns but the last)
    # and the target (the last column).
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]

    # Record the size of the raw dataset for this run.
    pipeline.track("num_samples", len(data))

    return X, y


@pipeline.step("split_data")
def split_data(
    data: Tuple[pd.DataFrame, pd.Series], train_frac: float
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    X, y = data
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_frac)

    # Persist the held-out set through the data catalog so the inference
    # and evaluation steps can read it back via 'data::' references.
    pipeline.write_data("test_features", X_test)
    pipeline.write_data("test_target", y_test)
    return X_train, X_test, y_train, y_test
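
Outside of Villard, the split step boils down to plain scikit-learn. As a quick standalone sanity check (not part of the pipeline), you can verify what split_data produces for the default train_frac of 0.8:

import pandas as pd
from sklearn.model_selection import train_test_split

# Read the same raw file the data catalog points at.
df = pd.read_csv("data/01_raw/iris.csv")
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# train_size=0.8 keeps 80% of the rows for training.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

# For the standard 150-row iris data this prints (120, 4) (30, 4).
print(X_train.shape, X_test.shape)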

The data science code: steps/data_science.py

from typing import Any, Dict, Tuple

import pandas as pd
from sklearn import linear_model, svm, tree
from sklearn.metrics import accuracy_score

from villard import pipeline

# Register the model classes so the configuration can refer to them through
# 'obj::' references, e.g. model_class: 'obj::SVC'.
pipeline.register_object("SVC", svm.SVC)
pipeline.register_object("DecisionTreeClassifier", tree.DecisionTreeClassifier)
pipeline.register_object("LogisticRegression", linear_model.LogisticRegression)


@pipeline.step("train_model")
def train_model(
    data: Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame],
    model_class: Any,
    model_params: Dict[str, Any],
) -> Any:
    X_train, _, y_train, _ = data
    clf = model_class(**model_params).fit(X_train, y_train)

    pipeline.write_data("trained_model", clf)
    pipeline.track("trained_model", clf.__class__.__name__)
    for k, v in model_params.items():
        pipeline.track(k, v)
    return clf


@pipeline.step("make_inference")
def make_inference(model: Any, feature_df: pd.DataFrame, stdout: bool) -> pd.DataFrame:
    pred = model.predict(feature_df)
    if stdout:
        print(pred)
    return pred


@pipeline.step("evaluate_model")
def evaluate_model(
    actual_target_df: pd.DataFrame, predicted_target_df: pd.DataFrame
) -> Any:
    accuracy = accuracy_score(actual_target_df, predicted_target_df)
    print("Accuracy: ", accuracy)
    pipeline.track("test_accuracy", accuracy)
    return accuracy
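
Because model classes are resolved through obj:: references, trying a different classifier only requires registering it under a name and pointing the configuration at that name. A small sketch (the RandomForestClassifier registration below is an addition, not part of the original example) that would live next to the existing register_object calls in steps/data_science.py:

from sklearn import ensemble
from villard import pipeline

# Register an additional model class under a name the configuration can reference.
pipeline.register_object("RandomForestClassifier", ensemble.RandomForestClassifier)

# In the Jsonnet configuration, train_model_params would then point at it, e.g.:
#   model_class: 'obj::RandomForestClassifier',
#   model_params: { n_estimators: 100 },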