1
1
import pathlib
2
- from typing import List , Dict , Tuple
2
+ from typing import List , Dict
3
3
4
4
import numpy as np
5
5
import pandas as pd
6
6
from loguru import logger
7
7
from sklearn .preprocessing import StandardScaler , OneHotEncoder
8
8
9
- from .utils import custom_combiner , load_feature
9
+ from .utils import load_feature
10
10
from ..config .settings import general_settings
11
11
from ..config .model import model_settings
12
12
13
13
14
- def data_processing (dataframe : pd .DataFrame ) -> Tuple [ np .ndarray , np . ndarray ] :
14
+ def data_processing_inference (dataframe : pd .DataFrame ) -> np .ndarray :
15
15
"""Applies the data processing pipeline.
16
16
17
17
Args:
18
18
dataframe (pd.DataFrame): the dataframe.
19
19
20
20
Returns:
21
- Tuple[ np.ndarray, np.ndarray] : the features and labels array, respectively .
21
+ np.ndarray: the features array.
22
22
"""
23
- # First step) removing duplicates, changing the height unit, removing outliers
24
- logger .info ("Removing duplicates from the dataset." )
25
- dataframe = _remove_duplicates (dataframe )
26
-
23
+ # First step) changing the height unit
27
24
logger .info ("Changing the height units to centimeters." )
28
25
dataframe = _change_height_units (dataframe )
29
26
30
- logger .info ("Removing outliers from the dataset." )
31
- dataframe = _remove_outliers (dataframe )
32
-
33
27
# Feature engineering step)
34
28
# Creating the BMI feature
35
29
logger .info ("Creating a new column for the BMI values from the data samples." )
@@ -48,57 +42,50 @@ def data_processing(dataframe: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
48
42
columns_to_drop = [
49
43
"Height" ,
50
44
"Weight" ,
51
- "family_history_with_overweight" ,
52
- "FAVC" ,
53
- "NCP" ,
54
- "CH2O" ,
55
45
]
56
46
logger .info (f"Dropping the columns { columns_to_drop } ." )
57
47
dataframe = _drop_features (dataframe = dataframe , features = columns_to_drop )
58
48
59
49
# Transforming the 'Age' and 'IS' columns into categorical columns
60
50
logger .info ("Categorizing the numerical columns ('Age' and 'IS')." )
61
- dataframe = _categorize_numerical_columns (dataframe )
51
+ age_bins = load_feature (
52
+ path = general_settings .ARTIFACTS_PATH ,
53
+ feature_name = 'qcut_bins'
54
+ )
55
+ dataframe = _categorize_numerical_columns (dataframe , age_bins )
62
56
63
57
# Transforming (Log Transformation) numerical columns
64
58
dataframe = _transform_numerical_columns (dataframe )
65
59
66
- # Scaling numerical columns
60
+ # Loading the encoders and scalers
61
+ logger .info (f"Loading encoders 'features_ohe' from path { general_settings .ARTIFACTS_PATH } ." )
62
+ encoders = load_feature (
63
+ path = general_settings .ARTIFACTS_PATH ,
64
+ feature_name = 'features_ohe'
65
+ )
66
+
67
+ logger .info (f"Loading scalers 'features_sc' from path { general_settings .ARTIFACTS_PATH } ." )
67
68
sc = load_feature (
68
69
path = general_settings .ARTIFACTS_PATH ,
69
70
feature_name = 'features_sc'
70
71
)
72
+
73
+ # Scaling numerical columns
71
74
dataframe = _scale_numerical_columns (dataframe = dataframe , sc = sc )
72
75
73
76
# Encoding categorical columns
74
- encoders = load_feature (
75
- path = general_settings .ARTIFACTS_PATH ,
76
- feature_name = 'features_ohe'
77
- )
78
77
dataframe = _encode_categorical_columns (
79
78
dataframe = dataframe ,
80
79
encoders = encoders ,
81
80
target_column = general_settings .TARGET_COLUMN
82
81
)
83
82
84
83
# Selecting only the features that are important for the model
85
- dataframe = dataframe [model_settings .FEATURES + [ general_settings . TARGET_COLUMN ] ]
84
+ dataframe = dataframe [model_settings .FEATURES ]
86
85
logger .info (f"Filtering the features columns, keeping only { model_settings .FEATURES } columns." )
87
86
88
- # Splitting the data into X (features) and y (label)
89
- logger .info ("Splitting the dataset into X and y arrays." )
90
- X = dataframe .drop (columns = [general_settings .TARGET_COLUMN ]).values
91
- y = dataframe [general_settings .TARGET_COLUMN ].values
92
-
93
- # Encoding the labels array
94
- logger .info (f"Encoding the target column ({ general_settings .TARGET_COLUMN } )." )
95
- label_encoder = load_feature (
96
- path = general_settings .ARTIFACTS_PATH ,
97
- feature_name = 'label_ohe'
98
- )
99
- y = _encode_labels_array (array = y , encoder = label_encoder )
100
-
101
- return X , y
87
+ X = dataframe .values
88
+ return X
102
89
103
90
def _drop_features (
104
91
dataframe : pd .DataFrame ,
@@ -239,7 +226,8 @@ def _calculate_bmr(
239
226
return dataframe
240
227
241
228
def _categorize_numerical_columns (
242
- dataframe : pd .DataFrame
229
+ dataframe : pd .DataFrame ,
230
+ bins : pd .DataFrame ,
243
231
) -> pd .DataFrame :
244
232
"""Categorizes the numerical columns (e.g., transforming int to object/class).
245
233
@@ -249,7 +237,7 @@ def _categorize_numerical_columns(
249
237
Returns:
250
238
pd.DataFrame: the dataframe with all numerical columns categorized.
251
239
"""
252
- dataframe ["Age" ] = pd .qcut (x = dataframe ["Age" ], q = 4 , labels = ["q1" , "q2" , "q3" , "q4" ])
240
+ dataframe ["Age" ] = pd .cut (x = dataframe ["Age" ], bins = bins , labels = ["q1" , "q2" , "q3" , "q4" ])
253
241
dataframe ["Age" ] = dataframe ["Age" ].astype ("object" )
254
242
dataframe ["IS" ] = dataframe ["IS" ].astype ("object" )
255
243
return dataframe
@@ -315,7 +303,7 @@ def _encode_categorical_columns(
315
303
pd.DataFrame: the dataframe with all categorical columns encoded.
316
304
"""
317
305
categorical_columns = dataframe .select_dtypes (include = "object" ).columns .tolist ()
318
- categorical_columns .remove (target_column )
306
+ # categorical_columns.remove(target_column)
319
307
logger .info (f"Encoding the { categorical_columns } columns." )
320
308
321
309
new_dataframe = pd .DataFrame ()
0 commit comments