Skip to content

Commit af34c01

Browse files
committed
updating data downloader script and dev dockerfile + fixing small issue in the experiments notebook
1 parent 1403b54 commit af34c01

File tree

3 files changed

+17
-9
lines changed

3 files changed

+17
-9
lines changed

data/download_data.sh

+4-2
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,11 @@ function parse_yaml {
2121
# setting important variables
2222
eval $(parse_yaml ../credentials.yaml "CONFIG_")
2323

24+
# defining important variables
2425
export KAGGLE_USERNAME="$CONFIG_KAGGLE_USERNAME"
2526
export KAGGLE_KEY="$CONFIG_KAGGLE_KEY"
26-
s3_bucket="$CONFIG_S3"
27+
export AWS_ACCESS_KEY_ID="$CONFIG_AWS_ACCESS_KEY"
28+
export AWS_SECRET_ACCESS_KEY="$CONFIG_AWS_SECRET_KEY"
2729

2830
# creating a folder within the temporary folder where the dataset will be temporarily saved
2931
mkdir /tmp/e2e-mlops-project/ && cd /tmp/e2e-mlops-project/
@@ -41,7 +43,7 @@ rm -f obesity-or-cvd-risk-classifyregressorcluster.zip
4143
mv ObesityDataSet.csv Original_ObesityDataSet.csv
4244

4345
# copying the csv file to the s3 bucket
44-
aws s3 cp Original_ObesityDataSet.csv s3://$s3_bucket
46+
aws s3 cp Original_ObesityDataSet.csv s3://$"$CONFIG_S3"
4547

4648
# deleting the created folder
4749
cd ~ && rm -rf /tmp/e2e-mlops-project

notebooks/dev_Dockerfile

+5-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
1-
# using the latest version of miniconda as a base for the Docker image
2-
FROM python:3.10.14-slim
1+
# using the latest version of Ubuntu 22.04 as a base for the Docker image
2+
FROM ubuntu:22.04
3+
4+
# installing Python and Unzip
5+
RUN apt-get update && apt-get install -y python3.10 python3.10-venv python3.10-dev python3-pip unzip
36

47
# creating the root folder
58
RUN mkdir -p /e2e-project/notebooks

notebooks/experimentations.ipynb

+8-5
Original file line numberDiff line numberDiff line change
@@ -213,10 +213,10 @@
213213
"with open(\"VERSION\", \"r\") as f:\n",
214214
" CODE_VERSION = f.readline().strip()\n",
215215
"\n",
216-
"if content[\"EC2\"] != \"\":\n",
216+
"if content[\"EC2\"] != \"YOUR_EC2_URL\":\n",
217217
" mlflow.set_tracking_uri(f\"http://{content['EC2']}:5000\") \n",
218218
"else:\n",
219-
" mlflow.set_tracking_uri(f\"http://127.0.0.1:5000\") \n",
219+
" raise ValueError(\"You must set an EC2 url!\\n\")\n",
220220
"\n",
221221
"print(f\"Tracking Server URI: '{mlflow.get_tracking_uri()}'\")\n",
222222
"\n",
@@ -273,11 +273,14 @@
273273
}
274274
],
275275
"source": [
276+
"os.environ[\"AWS_ACCESS_KEY_ID\"] = content[\"AWS_ACCESS_KEY\"]\n",
277+
"os.environ[\"AWS_SECRET_ACCESS_KEY\"] =content[\"AWS_SECRET_KEY\"]\n",
278+
"\n",
276279
"# downloading artifacts from the aws s3 bucket\n",
277-
"!aws s3 cp --recursive s3://{content[\"s3\"]}/artifacts {ARTIFACTS_OUTPUT_PATH}\n",
280+
"!aws s3 cp --recursive s3://{content[\"S3\"]}/artifacts {ARTIFACTS_OUTPUT_PATH}\n",
278281
"\n",
279282
"# downloading models from the aws s3 bucket\n",
280-
"!aws s3 cp --recursive s3://{content[\"s3\"]}/features {FEATURES_OUTPUT_PATH}\n"
283+
"!aws s3 cp --recursive s3://{content[\"S3\"]}/features {FEATURES_OUTPUT_PATH}"
281284
]
282285
},
283286
{
@@ -543,7 +546,7 @@
543546
"source": [
544547
"models = [dt, rf, xg, lg]\n",
545548
"min_features = math.floor(X_train.shape[1] * 0.2)\n",
546-
"max_features = math.floor(X_train.shape[1] * 0.3)\n",
549+
"max_features = math.floor(X_train.shape[1] * 0.5)\n",
547550
"\n",
548551
"# creating a new mlflow's experiment\n",
549552
"experiment_id = mlflow.create_experiment(\n",

0 commit comments

Comments
 (0)