Skip to content

Commit 0f2e2eb

Browse files
committed
Fix a minor issue in the data processing pipeline and add the API source code with an inference endpoint
1 parent baf96d7 commit 0f2e2eb

12 files changed

+334
-218
lines changed

notebooks/data_processing.ipynb

+6-2
Original file line numberDiff line numberDiff line change
@@ -1023,7 +1023,10 @@
10231023
}
10241024
],
10251025
"source": [
1026-
"df[\"Age\"] = pd.qcut(x=df[\"Age\"], q=4, labels=[\"q1\", \"q2\", \"q3\", \"q4\"])\n",
1026+
"values, bins = pd.qcut(x=df[\"Age\"], q=4, retbins=True, labels=[\"q1\", \"q2\", \"q3\", \"q4\"])\n",
1027+
"bins = np.concatenate(([-np.inf], bins[1:-1], [np.inf]))\n",
1028+
"\n",
1029+
"df[\"Age\"] = values\n",
10271030
"df[\"Age\"] = df[\"Age\"].astype(\"object\")\n",
10281031
"df.head()"
10291032
]
@@ -1820,7 +1823,7 @@
18201823
" sparse_output=False,\n",
18211824
" handle_unknown=\"infrequent_if_exist\",\n",
18221825
" min_frequency=20,\n",
1823-
" feature_name_combiner=custom_combiner,\n",
1826+
" # feature_name_combiner=custom_combiner,\n",
18241827
" )\n",
18251828
" \n",
18261829
" train_categorical_features = pd.DataFrame(\n",
@@ -1921,6 +1924,7 @@
19211924
"joblib.dump(scalers, os.path.join(ARTIFACTS_OUTPUT_PATH, \"features_sc.pkl\"))\n",
19221925
"joblib.dump(encoders, os.path.join(ARTIFACTS_OUTPUT_PATH, \"features_ohe.pkl\"))\n",
19231926
"joblib.dump(ohe_label, os.path.join(ARTIFACTS_OUTPUT_PATH, \"label_ohe.pkl\"))\n",
1927+
"joblib.dump(bins, os.path.join(ARTIFACTS_OUTPUT_PATH, \"qcut_bins.pkl\"))\n",
19241928
"\n",
19251929
"joblib.dump(X_train, os.path.join(FEATURES_OUTPUT_PATH, \"X_train.pkl\"))\n",
19261930
"joblib.dump(y_train, os.path.join(FEATURES_OUTPUT_PATH, \"y_train.pkl\"))\n",

notebooks/experimentations.ipynb

+106-116
Large diffs are not rendered by default.

src/api.py

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import pandas as pd
2+
from fastapi import FastAPI
3+
4+
from .data.processing import load_dataset, data_processing_inference
5+
from .config.settings import general_settings
6+
from .config.model import model_settings
7+
from .model.inference import ModelServe
8+
from .schema.person import Person
9+
from .schema.prediction import Prediction
10+
11+
12+
app = FastAPI()
13+
14+
@app.get("/version")
def check_versions():
    """Report the versions currently served by the API.

    The code version is the first line of the ``../notebooks/VERSION`` file
    (the path is resolved against the current working directory), stripped
    of surrounding whitespace. The model version is read from
    ``model_settings.VERSION``.

    Returns:
        dict: the ``code_version`` and ``model_version`` values.
    """
    with open("../notebooks/VERSION", "r", encoding="utf-8") as version_file:
        first_line = version_file.readline()

    return dict(
        code_version=first_line.strip(),
        model_version=model_settings.VERSION,
    )
23+
24+
# `response_model` is an option of the route decorator. Declared as a function
# parameter (as it was before) FastAPI treats it as a request parameter and
# never uses it as the response schema.
# NOTE(review): confirm that the `Prediction` schema matches the
# `{"predictions": [...]}` payload returned below.
@app.get("/predict", response_model=Prediction)
async def prediction(person: Person):
    """Predict the target class for a single person.

    The model described by ``model_settings`` is instantiated and loaded on
    every request. The request payload is dumped into a one-row dataframe,
    passed through ``data_processing_inference`` and fed to the model.

    Args:
        person (Person): the person's attributes, as received in the request.

    Returns:
        dict: a dictionary whose ``predictions`` key holds the model's
            predictions converted to a list.
    """
    loaded_model = ModelServe(
        model_name=model_settings.MODEL_NAME,
        model_flavor=model_settings.MODEL_FLAVOR,
        model_version=model_settings.VERSION,
    )
    loaded_model.load()

    # A single-element list of records yields a one-row dataframe.
    data = pd.DataFrame.from_dict([person.model_dump()])
    X = data_processing_inference(data)

    return {
        "predictions": loaded_model.predict(X).tolist()
    }

src/data/processing.py

+27-39
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,29 @@
11
import pathlib
2-
from typing import List, Dict, Tuple
2+
from typing import List, Dict
33

44
import numpy as np
55
import pandas as pd
66
from loguru import logger
77
from sklearn.preprocessing import StandardScaler, OneHotEncoder
88

9-
from .utils import custom_combiner, load_feature
9+
from .utils import load_feature
1010
from ..config.settings import general_settings
1111
from ..config.model import model_settings
1212

1313

14-
def data_processing(dataframe: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
14+
def data_processing_inference(dataframe: pd.DataFrame) -> np.ndarray:
1515
"""Applies the data processing pipeline.
1616
1717
Args:
1818
dataframe (pd.DataFrame): the dataframe.
1919
2020
Returns:
21-
Tuple[np.ndarray, np.ndarray]: the features and labels array, respectively.
21+
np.ndarray: the features array.
2222
"""
23-
# First step) removing duplicates, changing the height unit, removing outliers
24-
logger.info("Removing duplicates from the dataset.")
25-
dataframe = _remove_duplicates(dataframe)
26-
23+
# First step) changing the height unit
2724
logger.info("Changing the height units to centimeters.")
2825
dataframe = _change_height_units(dataframe)
2926

30-
logger.info("Removing outliers from the dataset.")
31-
dataframe = _remove_outliers(dataframe)
32-
3327
# Feature engineering step)
3428
# Creating the BMI feature
3529
logger.info("Creating a new column for the BMI values from the data samples.")
@@ -48,57 +42,50 @@ def data_processing(dataframe: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
4842
columns_to_drop = [
4943
"Height",
5044
"Weight",
51-
"family_history_with_overweight",
52-
"FAVC",
53-
"NCP",
54-
"CH2O",
5545
]
5646
logger.info(f"Dropping the columns {columns_to_drop}.")
5747
dataframe = _drop_features(dataframe=dataframe, features=columns_to_drop)
5848

5949
# Transforming the AGE and IS columns into a categorical columns
6050
logger.info("Categorizing the numerical columns ('Age' and 'IS').")
61-
dataframe = _categorize_numerical_columns(dataframe)
51+
age_bins = load_feature(
52+
path=general_settings.ARTIFACTS_PATH,
53+
feature_name='qcut_bins'
54+
)
55+
dataframe = _categorize_numerical_columns(dataframe, age_bins)
6256

6357
# Transforming (Log Transformation) numerical columns
6458
dataframe = _transform_numerical_columns(dataframe)
6559

66-
# Scaling numerical columns
60+
# Loading the encoders and scalers
61+
logger.info(f"Loading encoders 'features_ohe' from path {general_settings.ARTIFACTS_PATH}.")
62+
encoders = load_feature(
63+
path=general_settings.ARTIFACTS_PATH,
64+
feature_name='features_ohe'
65+
)
66+
67+
logger.info(f"Loading scalers 'features_sc' from path {general_settings.ARTIFACTS_PATH}.")
6768
sc = load_feature(
6869
path=general_settings.ARTIFACTS_PATH,
6970
feature_name='features_sc'
7071
)
72+
73+
# Scaling numerical columns
7174
dataframe = _scale_numerical_columns(dataframe=dataframe, sc=sc)
7275

7376
# Encoding categorical columns
74-
encoders = load_feature(
75-
path=general_settings.ARTIFACTS_PATH,
76-
feature_name='features_ohe'
77-
)
7877
dataframe = _encode_categorical_columns(
7978
dataframe=dataframe,
8079
encoders=encoders,
8180
target_column=general_settings.TARGET_COLUMN
8281
)
8382

8483
# Selecting only the features that are important for the model
85-
dataframe = dataframe[model_settings.FEATURES + [general_settings.TARGET_COLUMN]]
84+
dataframe = dataframe[model_settings.FEATURES]
8685
logger.info(f"Filtering the features columns, keeping only {model_settings.FEATURES} columns.")
8786

88-
# Splitting the data into X (features) and y (label)
89-
logger.info("Splitting the dataset into X and y arrays.")
90-
X = dataframe.drop(columns=[general_settings.TARGET_COLUMN]).values
91-
y = dataframe[general_settings.TARGET_COLUMN].values
92-
93-
# Encoding the labels array
94-
logger.info(f"Encoding the target column ({general_settings.TARGET_COLUMN}).")
95-
label_encoder = load_feature(
96-
path=general_settings.ARTIFACTS_PATH,
97-
feature_name='label_ohe'
98-
)
99-
y = _encode_labels_array(array=y, encoder=label_encoder)
100-
101-
return X, y
87+
X = dataframe.values
88+
return X
10289

10390
def _drop_features(
10491
dataframe: pd.DataFrame,
@@ -239,7 +226,8 @@ def _calculate_bmr(
239226
return dataframe
240227

241228
def _categorize_numerical_columns(
242-
dataframe: pd.DataFrame
229+
dataframe: pd.DataFrame,
230+
bins: pd.DataFrame,
243231
) -> pd.DataFrame:
244232
"""Categorizes the numerical columns (e.g., transforming int to object/class).
245233
@@ -249,7 +237,7 @@ def _categorize_numerical_columns(
249237
Returns:
250238
pd.DataFrame: the dataframe with all numerical columns categorized.
251239
"""
252-
dataframe["Age"] = pd.qcut(x=dataframe["Age"], q=4, labels=["q1", "q2", "q3", "q4"])
240+
dataframe["Age"] = pd.cut(x=dataframe["Age"], bins=bins, labels=["q1", "q2", "q3", "q4"])
253241
dataframe["Age"] = dataframe["Age"].astype("object")
254242
dataframe["IS"] = dataframe["IS"].astype("object")
255243
return dataframe
@@ -315,7 +303,7 @@ def _encode_categorical_columns(
315303
pd.DataFrame: the dataframe with all categorical columns encoded.
316304
"""
317305
categorical_columns = dataframe.select_dtypes(include="object").columns.tolist()
318-
categorical_columns.remove(target_column)
306+
# categorical_columns.remove(target_column)
319307
logger.info(f"Encoding the {categorical_columns} columns.")
320308

321309
new_dataframe = pd.DataFrame()

src/data/utils.py

-15
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
from ..config.aws import aws_credentials
1212
from ..config.kaggle import kaggle_credentials
1313

14-
1514
def load_feature(
1615
path: pathlib.Path,
1716
feature_name: str,
@@ -28,20 +27,6 @@ def load_feature(
2827
logger.info(f"Loading feature/encoder/scaler from file {path}.")
2928
return joblib.load(pathlib.PosixPath.joinpath(path, f"{feature_name}.pkl"))
3029

31-
32-
def custom_combiner(feature, category) -> str:
33-
"""Auxiliary function that is used to rename the output's columns from
34-
the OneHotEncoder instance.
35-
36-
Args:
37-
feature (_type_): the feature's name (ignored).
38-
category (_type_): the category's name.
39-
40-
Returns:
41-
str: the current category from that given feature.
42-
"""
43-
return str(category)
44-
4530
@logger.catch
4631
def download_dataset(
4732
name: str,

src/main.py

-24
This file was deleted.

src/model/__init__.py

Whitespace-only changes.

src/model/builder.py

+66
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
import mlflow
2+
import numpy as np
3+
import pandas as pd
4+
from loguru import logger
5+
from sklearn.metrics import f1_score
6+
from sklearn.model_selection import train_test_split
7+
from xgboost import XGBClassifier
8+
9+
from ..config.model import model_settings
10+
from ..data.processing import data_processing
11+
12+
13+
class ModelBuilder:
    """Builds (trains) the model identified by a name, a flavor and a version.
    """
    def __init__(
        self,
        model_name: str,
        model_flavor: str,
        model_version: str,
    ) -> None:
        """Stores the model's metadata; no model exists until ``train`` runs.

        Args:
            model_name (str): the model's name.
            model_flavor (str): the model's MLflow flavor.
            model_version (str): the model's version.
        """
        self.model_name, self.model_flavor, self.model_version = (
            model_name,
            model_flavor,
            model_version,
        )
        self.model = None

    def _weighted_f1(self, x: np.ndarray, y: np.ndarray) -> float:
        """Scores the current model on a features/labels pair.

        Both the model's predictions and the labels are reduced with an
        argmax along axis 1 before the weighted F1-Score is computed.

        Args:
            x (np.ndarray): the features array.
            y (np.ndarray): the labels array.

        Returns:
            float: the weighted F1-Score.
        """
        predicted = np.argmax(self.model.predict(x), axis=1)
        expected = np.argmax(y, axis=1).reshape(-1)
        return f1_score(y_true=expected, y_pred=predicted, average="weighted")

    @logger.catch
    def train(self, dataframe: pd.DataFrame) -> None:
        """Pre-processes the data, fits an ``XGBClassifier`` and logs its scores.

        The processed data is split 90/10 (shuffled and stratified on the
        labels) into training and validation sets. The fitted classifier is
        kept in ``self.model``.

        Args:
            dataframe (pd.DataFrame): the dataframe to train the model on.
        """
        logger.info("Pre-processing the data before training the model.")

        # Pre-processing and cleaning the data
        features, labels = data_processing(dataframe)

        logger.info("Splitting the data into training and validation using 90/10 split.")

        # Splitting the data into training and validation
        split_options = {"test_size": 0.1, "shuffle": True, "stratify": labels}
        X_train, X_valid, y_train, y_valid = train_test_split(
            features,
            labels,
            **split_options,
        )

        logger.info("Training the model using the given data.")
        self.model = XGBClassifier()
        self.model.fit(X_train, y_train)

        # Assessing the model's performance on the training set
        score = self._weighted_f1(X_train, y_train)
        logger.info(f"Achieved a weighted F1-Score of {score} on the training set.")

        # Assessing the model's performance on the validation set
        score = self._weighted_f1(X_valid, y_valid)
        logger.info(f"Achieved a weighted F1-Score of {score} on the validation set.")

src/model.py src/model/inference.py

+10-22
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,17 @@
11
import mlflow
22
import numpy as np
33
from loguru import logger
4-
from sklearn.metrics import f1_score
54

6-
from .config.model import model_settings
5+
from ..config.model import model_settings
6+
from ..config.settings import general_settings
7+
from ..data.utils import load_feature
78

9+
label_encoder = load_feature(
10+
path=general_settings.ARTIFACTS_PATH,
11+
feature_name='label_ohe'
12+
)
813

9-
class Model:
14+
class ModelServe:
1015
"""The trained model's class.
1116
"""
1217
def __init__(
@@ -53,24 +58,7 @@ def predict(self, x: np.ndarray) -> np.ndarray:
5358
Returns:
5459
np.ndarray: the predictions array.
5560
"""
56-
prediction = np.argmax(self.model.predict(x), axis=1)
61+
prediction = self.model.predict(x)
62+
prediction = label_encoder.inverse_transform(prediction)
5763
logger.info(f"Prediction: {prediction}.")
5864
return prediction
59-
60-
def score(self, x: np.ndarray, y: np.ndarray) -> float:
61-
"""Calculates the F1-Score of a trained model given a pair of features
62-
and labels arrays.
63-
64-
Args:
65-
x (np.ndarray): the features array.
66-
y (np.ndarray): the targets array.
67-
68-
Returns:
69-
float: the F1 Score value.
70-
"""
71-
prediction = self.predict(x).reshape(-1)
72-
_y = np.argmax(y, axis=1).reshape(-1)
73-
74-
score = f1_score(y_true=_y, y_pred=prediction, average="weighted")
75-
logger.info(f"Achieved a weighted F1-Score of {score}.")
76-
return score

src/schema/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)