rafaelgreca
diff --git a/‎notebooks/data_processing.ipynb
+6-2 b/‎notebooks/data_processing.ipynb
+6-2
diff --git a/‎notebooks/experimentations.ipynb
+106-116 b/‎notebooks/experimentations.ipynb
+106-116
diff --git a/‎src/api.py
+38 b/‎src/api.py
+38
diff --git a/‎src/data/processing.py
+27-39 b/‎src/data/processing.py
+27-39
diff --git a/‎src/data/utils.py
-15 b/‎src/data/utils.py
-15
diff --git a/‎src/main.py
-24 b/‎src/main.py
-24
diff --git a/‎src/model/__init__.py b/‎src/model/__init__.py
diff --git a/‎src/model/builder.py
+66 b/‎src/model/builder.py
+66
diff --git a/‎src/model.py ‎src/model/inference.py
+10-22 b/‎src/model.py ‎src/model/inference.py
+10-22
diff --git a/‎src/schema/__init__.py b/‎src/schema/__init__.py
@@ -1023,7 +1023,10 @@
     }
    ],
    "source": [
-    "df[\"Age\"] = pd.qcut(x=df[\"Age\"], q=4, labels=[\"q1\", \"q2\", \"q3\", \"q4\"])\n",
+    "values, bins = pd.qcut(x=df[\"Age\"], q=4, retbins=True, labels=[\"q1\", \"q2\", \"q3\", \"q4\"])\n",
+    "bins = np.concatenate(([-np.inf], bins[1:-1], [np.inf]))\n",
+    "\n",
+    "df[\"Age\"] = values\n",
     "df[\"Age\"] = df[\"Age\"].astype(\"object\")\n",
     "df.head()"
    ]
@@ -1820,7 +1823,7 @@
     "        sparse_output=False,\n",
     "        handle_unknown=\"infrequent_if_exist\",\n",
     "        min_frequency=20,\n",
-    "        feature_name_combiner=custom_combiner,\n",
+    "        # feature_name_combiner=custom_combiner,\n",
     "    )\n",
     "    \n",
     "    train_categorical_features = pd.DataFrame(\n",
@@ -1921,6 +1924,7 @@
     "joblib.dump(scalers, os.path.join(ARTIFACTS_OUTPUT_PATH, \"features_sc.pkl\"))\n",
     "joblib.dump(encoders, os.path.join(ARTIFACTS_OUTPUT_PATH, \"features_ohe.pkl\"))\n",
     "joblib.dump(ohe_label, os.path.join(ARTIFACTS_OUTPUT_PATH, \"label_ohe.pkl\"))\n",
+    "joblib.dump(bins, os.path.join(ARTIFACTS_OUTPUT_PATH, \"qcut_bins.pkl\"))\n",
     "\n",
     "joblib.dump(X_train, os.path.join(FEATURES_OUTPUT_PATH, \"X_train.pkl\"))\n",
     "joblib.dump(y_train, os.path.join(FEATURES_OUTPUT_PATH, \"y_train.pkl\"))\n",
 
@@ -0,0 +1,38 @@
+import pandas as pd
+from fastapi import FastAPI
+
+from .data.processing import load_dataset, data_processing_inference
+from .config.settings import general_settings
+from .config.model import model_settings
+from .model.inference import ModelServe
+from .schema.person import Person
+from .schema.prediction import Prediction
+
+
+app = FastAPI()
+
+@app.get("/version")
+def check_versions():
+    """Reports the code and model versions served by this API.
+
+    Returns:
+        dict: the ``code_version`` (first line of the ``notebooks/VERSION``
+            file, stripped) and the ``model_version``
+            (``model_settings.VERSION``).
+
+    Raises:
+        OSError: if the VERSION file cannot be opened.
+    """
+    # Function-scope import so this handler does not depend on the module's
+    # import block being changed as well.
+    import pathlib
+
+    # Resolve the file from this module's location (src/api.py ->
+    # <root>/notebooks/VERSION) instead of the process working directory;
+    # the original "../notebooks/VERSION" only worked when the server was
+    # started from inside src/.
+    version_file = (
+        pathlib.Path(__file__).resolve().parent.parent / "notebooks" / "VERSION"
+    )
+    with open(version_file, "r", encoding="utf-8") as f:
+        code_version = f.readline().strip()
+
+    return {
+        "code_version": code_version,
+        "model_version": model_settings.VERSION,
+    }
+
+# GET requests carrying a JSON body are dropped by some clients and proxies,
+# so POST is registered alongside the original GET route (kept for backward
+# compatibility).
+@app.post("/predict", response_model=Prediction)
+@app.get("/predict", response_model=Prediction)
+async def prediction(person: Person):
+    """Runs the model on a single person payload.
+
+    `response_model` is declared on the route decorators: as a handler
+    parameter FastAPI would treat it as a request parameter and would not
+    apply the response schema.
+
+    Args:
+        person (Person): the request payload with the raw input features.
+
+    Returns:
+        dict: ``{"predictions": [...]}`` with the model output as a list.
+    """
+    # NOTE(review): assumes `Prediction` declares a `predictions` field that
+    # matches the dict returned below - confirm against schema/prediction.py.
+    # NOTE(review): the model is instantiated and loaded on every request;
+    # consider loading it once at startup if `load()` is expensive.
+    loaded_model = ModelServe(
+        model_name=model_settings.MODEL_NAME,
+        model_flavor=model_settings.MODEL_FLAVOR,
+        model_version=model_settings.VERSION,
+    )
+    loaded_model.load()
+
+    # Single-row frame built from the payload's field dict.
+    data = pd.DataFrame([person.model_dump()])
+    X = data_processing_inference(data)
+
+    return {
+        "predictions": loaded_model.predict(X).tolist()
+    }
@@ -1,35 +1,29 @@
 import pathlib
-from typing import List, Dict, Tuple
+from typing import List, Dict
 
 import numpy as np
 import pandas as pd
 from loguru import logger
 from sklearn.preprocessing import StandardScaler, OneHotEncoder
 
-from .utils import custom_combiner, load_feature
+from .utils import load_feature
 from ..config.settings import general_settings
 from ..config.model import model_settings
 
 
-def data_processing(dataframe: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
+def data_processing_inference(dataframe: pd.DataFrame) -> np.ndarray:
     """Applies the data processing pipeline.
 
     Args:
         dataframe (pd.DataFrame): the dataframe.
 
     Returns:
-        Tuple[np.ndarray, np.ndarray]: the features and labels array, respectively.
+        np.ndarray: the features array.
     """
-    # First step) removing duplicates, changing the height unit, removing outliers
-    logger.info("Removing duplicates from the dataset.")
-    dataframe = _remove_duplicates(dataframe)
-
+    # First step) changing the height unit
     logger.info("Changing the height units to centimeters.")
     dataframe = _change_height_units(dataframe)
 
-    logger.info("Removing outliers from the dataset.")
-    dataframe = _remove_outliers(dataframe)
-
     # Feature engineering step)
     # Creating the BMI feature
     logger.info("Creating a new column for the BMI values from the data samples.")
@@ -48,57 +42,50 @@ def data_processing(dataframe: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
     columns_to_drop = [
         "Height",
         "Weight",
-        "family_history_with_overweight",
-        "FAVC",
-        "NCP",
-        "CH2O",
     ]
     logger.info(f"Dropping the columns {columns_to_drop}.")
     dataframe = _drop_features(dataframe=dataframe, features=columns_to_drop)
 
     # Transforming the AGE and IS columns into a categorical columns
     logger.info("Categorizing the numerical columns ('Age' and 'IS').")
-    dataframe = _categorize_numerical_columns(dataframe)
+    age_bins = load_feature(
+        path=general_settings.ARTIFACTS_PATH,
+        feature_name='qcut_bins'
+    )
+    dataframe = _categorize_numerical_columns(dataframe, age_bins)
 
     # Transforming (Log Transformation) numerical columns
     dataframe = _transform_numerical_columns(dataframe)
 
-    # Scaling numerical columns
+    # Loading the encoders and scalers
+    logger.info(f"Loading encoders 'features_ohe' from path {general_settings.ARTIFACTS_PATH}.")
+    encoders = load_feature(
+        path=general_settings.ARTIFACTS_PATH,
+        feature_name='features_ohe'
+    )
+
+    logger.info(f"Loading scalers 'features_sc' from path {general_settings.ARTIFACTS_PATH}.")
     sc = load_feature(
         path=general_settings.ARTIFACTS_PATH,
         feature_name='features_sc'
     )
+
+    # Scaling numerical columns
     dataframe = _scale_numerical_columns(dataframe=dataframe, sc=sc)
 
     # Encoding categorical columns
-    encoders = load_feature(
-        path=general_settings.ARTIFACTS_PATH,
-        feature_name='features_ohe'
-    )
     dataframe = _encode_categorical_columns(
         dataframe=dataframe,
         encoders=encoders,
         target_column=general_settings.TARGET_COLUMN
     )
 
     # Selecting only the features that are important for the model
-    dataframe = dataframe[model_settings.FEATURES + [general_settings.TARGET_COLUMN]]
+    dataframe = dataframe[model_settings.FEATURES]
     logger.info(f"Filtering the features columns, keeping only {model_settings.FEATURES} columns.")
 
-    # Splitting the data into X (features) and y (label)
-    logger.info("Splitting the dataset into X and y arrays.")
-    X = dataframe.drop(columns=[general_settings.TARGET_COLUMN]).values
-    y = dataframe[general_settings.TARGET_COLUMN].values
-
-    # Encoding the labels array
-    logger.info(f"Encoding the target column ({general_settings.TARGET_COLUMN}).")
-    label_encoder = load_feature(
-        path=general_settings.ARTIFACTS_PATH,
-        feature_name='label_ohe'
-    )
-    y = _encode_labels_array(array=y, encoder=label_encoder)
-
-    return X, y
+    X = dataframe.values
+    return X
 
 def _drop_features(
     dataframe: pd.DataFrame,
@@ -239,7 +226,8 @@ def _calculate_bmr(
     return dataframe
 
 def _categorize_numerical_columns(
-    dataframe: pd.DataFrame
+    dataframe: pd.DataFrame,
+    bins: pd.DataFrame,
 ) -> pd.DataFrame:
     """Categorizes the numerical columns (e.g., transforming int to object/class).
 
@@ -249,7 +237,7 @@ def _categorize_numerical_columns(
     Returns:
         pd.DataFrame: the dataframe with all numerical columns categorized.
     """
-    dataframe["Age"] = pd.qcut(x=dataframe["Age"], q=4, labels=["q1", "q2", "q3", "q4"])
+    dataframe["Age"] = pd.cut(x=dataframe["Age"], bins=bins, labels=["q1", "q2", "q3", "q4"])
     dataframe["Age"] = dataframe["Age"].astype("object")
     dataframe["IS"] = dataframe["IS"].astype("object")
     return dataframe
@@ -315,7 +303,7 @@ def _encode_categorical_columns(
         pd.DataFrame: the dataframe with all categorical columns encoded.
     """
     categorical_columns = dataframe.select_dtypes(include="object").columns.tolist()
-    categorical_columns.remove(target_column)
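+    # NOTE(review): the former `categorical_columns.remove(target_column)` call was dropped - presumably because inference inputs carry no label column; confirm.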
     logger.info(f"Encoding the {categorical_columns} columns.")
 
     new_dataframe = pd.DataFrame()
 
@@ -11,7 +11,6 @@
 from ..config.aws import aws_credentials
 from ..config.kaggle import kaggle_credentials
 
-
 def load_feature(
     path: pathlib.Path,
     feature_name: str,
@@ -28,20 +27,6 @@ def load_feature(
     logger.info(f"Loading feature/encoder/scaler from file {path}.")
     return joblib.load(pathlib.PosixPath.joinpath(path, f"{feature_name}.pkl"))
 
-
-def custom_combiner(feature, category) -> str:
-    """Auxiliary function that is used to rename the output's columns from
-    the OneHotEncoder instance.
-
-    Args:
-        feature (_type_): the feature's name (ignored).
-        category (_type_): the category's name.
-
-    Returns:
-        str: the current category from that given feature.
-    """
-    return str(category)
-
 @logger.catch
 def download_dataset(
     name: str,
 
@@ -0,0 +1,66 @@
+import mlflow
+import numpy as np
+import pandas as pd
+from loguru import logger
+from sklearn.metrics import f1_score
+from sklearn.model_selection import train_test_split
+from xgboost import XGBClassifier
+
+from ..config.model import model_settings
+from ..data.processing import data_processing
+
+
+class ModelBuilder:
+    """Trains the model and keeps the fitted estimator in `self.model`.
+    """
+    def __init__(
+        self,
+        model_name: str,
+        model_flavor: str,
+        model_version: str,
+    ) -> None:
+        """Stores the model's metadata; `self.model` stays None until `train` runs.
+
+        Args:
+            model_name (str): the model's name.
+            model_flavor (str): the model's MLflow flavor.
+            model_version (str): the model's version.
+        """
+        self.model = None
+        self.model_name = model_name
+        self.model_flavor = model_flavor
+        self.model_version = model_version
+
+    def _weighted_f1(self, x: np.ndarray, y: np.ndarray) -> float:
+        """Computes the weighted F1-Score of the fitted model on one split.
+
+        Both the model's output and the labels are reduced with an argmax
+        over axis 1 before being compared.
+
+        Args:
+            x (np.ndarray): the features array.
+            y (np.ndarray): the labels array.
+
+        Returns:
+            float: the weighted F1-Score.
+        """
+        predicted = np.argmax(self.model.predict(x), axis=1)
+        expected = np.argmax(y, axis=1).reshape(-1)
+        return f1_score(y_true=expected, y_pred=predicted, average="weighted")
+
+    @logger.catch
+    def train(self, dataframe: pd.DataFrame) -> None:
+        """Pre-processes the data, fits an XGBClassifier and logs its F1-Scores.
+
+        The fitted estimator is stored in `self.model`. Exceptions raised
+        here are handled by the `logger.catch` decorator.
+
+        Args:
+            dataframe (pd.DataFrame): the dataset handed to `data_processing`.
+        """
+        logger.info("Pre-processing the data before training the model.")
+
+        # NOTE(review): `data_processing` is imported from `..data.processing`,
+        # but this same change set shows it being renamed to
+        # `data_processing_inference` (features only) - confirm that a
+        # pipeline returning (X, y) is still available for training.
+        features, labels = data_processing(dataframe)
+
+        logger.info("Splitting the data into training and validation using 90/10 split.")
+
+        # Shuffled hold-out split, stratified on the labels
+        X_train, X_valid, y_train, y_valid = train_test_split(
+            features,
+            labels,
+            test_size=0.1,
+            shuffle=True,
+            stratify=labels,
+        )
+
+        logger.info("Training the model using the given data.")
+        self.model = XGBClassifier()
+        self.model.fit(X_train, y_train)
+
+        # Performance on the data the model was fitted on
+        score = self._weighted_f1(X_train, y_train)
+        logger.info(f"Achieved a weighted F1-Score of {score} on the training set.")
+
+        # Performance on the held-out data
+        score = self._weighted_f1(X_valid, y_valid)
+        logger.info(f"Achieved a weighted F1-Score of {score} on the validation set.")
@@ -1,12 +1,17 @@
 import mlflow
 import numpy as np
 from loguru import logger
-from sklearn.metrics import f1_score
 
-from .config.model import model_settings
+from ..config.model import model_settings
+from ..config.settings import general_settings
+from ..data.utils import load_feature
 
+# Loaded once when this module is imported; ModelServe.predict uses it to
+# map the model's output back through `inverse_transform`.
+label_encoder = load_feature(
+    general_settings.ARTIFACTS_PATH,
+    "label_ohe",
+)
 
-class Model:
+class ModelServe:
     """The trained model's class.
     """
     def __init__(
@@ -53,24 +58,7 @@ def predict(self, x: np.ndarray) -> np.ndarray:
         Returns:
             np.ndarray: the predictions array.
         """
-        prediction = np.argmax(self.model.predict(x), axis=1)
+        prediction = self.model.predict(x)
+        prediction = label_encoder.inverse_transform(prediction)
         logger.info(f"Prediction: {prediction}.")
         return prediction
-
-    def score(self, x: np.ndarray, y: np.ndarray) -> float:
-        """Calculates the F1-Score of a trained model given a pair of features
-        and labels arrays.
-
-        Args:
-            x (np.ndarray): the features array.
-            y (np.ndarray): the targets array.
-
-        Returns:
-            float: the F1 Score value.
-        """
-        prediction = self.predict(x).reshape(-1)
-        _y = np.argmax(y, axis=1).reshape(-1)
-
-        score = f1_score(y_true=_y, y_pred=prediction, average="weighted")
-        logger.info(f"Achieved a weighted F1-Score of {score}.")
-        return score