RachelMurray-Watson
diff --git a/‎CDC_Classifier_Period.ipynb
+87-12 b/‎CDC_Classifier_Period.ipynb
+87-12
diff --git a/‎CDC_classifier_auroc_0.8872_CDC_period_full.sav
2.34 KB b/‎CDC_classifier_auroc_0.8872_CDC_period_full.sav
2.34 KB
@@ -10383,7 +10383,34 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "0\n"
+      "0\n",
+      "1\n",
+      "2\n",
+      "3\n",
+      "4\n",
+      "5\n",
+      "6\n",
+      "7\n",
+      "8\n",
+      "9\n",
+      "MCC: 0.7038956549471436\n",
+      "Accuracy: 0.9134016668263244\n",
+      "auROC: 0.8872189145023346\n"
+     ]
+    },
+    {
+     "ename": "ValueError",
+     "evalue": "Found input variables with inconsistent numbers of samples: [2409, 10439]",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[126], line 28\u001b[0m\n\u001b[1;32m     25\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mAccuracy:\u001b[39m\u001b[39m\"\u001b[39m, accuracy)\n\u001b[1;32m     26\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39m\"\u001b[39m\u001b[39mauROC:\u001b[39m\u001b[39m\"\u001b[39m, ROC)\n\u001b[0;32m---> 28\u001b[0m \u001b[39mprint\u001b[39m(confusion_matrix(y_test, y_pred))\n",
+      "File \u001b[0;32m~/miniconda3/envs/COVID_forecasting/lib/python3.11/site-packages/sklearn/utils/_param_validation.py:211\u001b[0m, in \u001b[0;36mvalidate_params.<locals>.decorator.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    205\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m    206\u001b[0m     \u001b[39mwith\u001b[39;00m config_context(\n\u001b[1;32m    207\u001b[0m         skip_parameter_validation\u001b[39m=\u001b[39m(\n\u001b[1;32m    208\u001b[0m             prefer_skip_nested_validation \u001b[39mor\u001b[39;00m global_skip_validation\n\u001b[1;32m    209\u001b[0m         )\n\u001b[1;32m    210\u001b[0m     ):\n\u001b[0;32m--> 211\u001b[0m         \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n\u001b[1;32m    212\u001b[0m \u001b[39mexcept\u001b[39;00m InvalidParameterError \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m    213\u001b[0m     \u001b[39m# When the function is just a wrapper around an estimator, we allow\u001b[39;00m\n\u001b[1;32m    214\u001b[0m     \u001b[39m# the function to delegate validation to the estimator, but we replace\u001b[39;00m\n\u001b[1;32m    215\u001b[0m     \u001b[39m# the name of the estimator by the name of the function in the error\u001b[39;00m\n\u001b[1;32m    216\u001b[0m     \u001b[39m# message to avoid confusion.\u001b[39;00m\n\u001b[1;32m    217\u001b[0m     msg \u001b[39m=\u001b[39m re\u001b[39m.\u001b[39msub(\n\u001b[1;32m    218\u001b[0m         \u001b[39mr\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mparameter of \u001b[39m\u001b[39m\\\u001b[39m\u001b[39mw+ must be\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m    219\u001b[0m         \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mparameter of \u001b[39m\u001b[39m{\u001b[39;00mfunc\u001b[39m.\u001b[39m\u001b[39m__qualname__\u001b[39m\u001b[39m}\u001b[39;00m\u001b[39m must be\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m    220\u001b[0m         
\u001b[39mstr\u001b[39m(e),\n\u001b[1;32m    221\u001b[0m     )\n",
+      "File \u001b[0;32m~/miniconda3/envs/COVID_forecasting/lib/python3.11/site-packages/sklearn/metrics/_classification.py:326\u001b[0m, in \u001b[0;36mconfusion_matrix\u001b[0;34m(y_true, y_pred, labels, sample_weight, normalize)\u001b[0m\n\u001b[1;32m    231\u001b[0m \u001b[39m@validate_params\u001b[39m(\n\u001b[1;32m    232\u001b[0m     {\n\u001b[1;32m    233\u001b[0m         \u001b[39m\"\u001b[39m\u001b[39my_true\u001b[39m\u001b[39m\"\u001b[39m: [\u001b[39m\"\u001b[39m\u001b[39marray-like\u001b[39m\u001b[39m\"\u001b[39m],\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    242\u001b[0m     y_true, y_pred, \u001b[39m*\u001b[39m, labels\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, sample_weight\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, normalize\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m\n\u001b[1;32m    243\u001b[0m ):\n\u001b[1;32m    244\u001b[0m \u001b[39m    \u001b[39m\u001b[39m\"\"\"Compute confusion matrix to evaluate the accuracy of a classification.\u001b[39;00m\n\u001b[1;32m    245\u001b[0m \n\u001b[1;32m    246\u001b[0m \u001b[39m    By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}`\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    324\u001b[0m \u001b[39m    (0, 2, 1, 1)\u001b[39;00m\n\u001b[1;32m    325\u001b[0m \u001b[39m    \"\"\"\u001b[39;00m\n\u001b[0;32m--> 326\u001b[0m     y_type, y_true, y_pred \u001b[39m=\u001b[39m _check_targets(y_true, y_pred)\n\u001b[1;32m    327\u001b[0m     \u001b[39mif\u001b[39;00m y_type \u001b[39mnot\u001b[39;00m \u001b[39min\u001b[39;00m (\u001b[39m\"\u001b[39m\u001b[39mbinary\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39mmulticlass\u001b[39m\u001b[39m\"\u001b[39m):\n\u001b[1;32m    328\u001b[0m         \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\u001b[39m\"\u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m is not supported\u001b[39m\u001b[39m\"\u001b[39m \u001b[39m%\u001b[39m y_type)\n",
+      "File \u001b[0;32m~/miniconda3/envs/COVID_forecasting/lib/python3.11/site-packages/sklearn/metrics/_classification.py:84\u001b[0m, in \u001b[0;36m_check_targets\u001b[0;34m(y_true, y_pred)\u001b[0m\n\u001b[1;32m     57\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_check_targets\u001b[39m(y_true, y_pred):\n\u001b[1;32m     58\u001b[0m \u001b[39m    \u001b[39m\u001b[39m\"\"\"Check that y_true and y_pred belong to the same classification task.\u001b[39;00m\n\u001b[1;32m     59\u001b[0m \n\u001b[1;32m     60\u001b[0m \u001b[39m    This converts multiclass or binary types to a common shape, and raises a\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     82\u001b[0m \u001b[39m    y_pred : array or indicator matrix\u001b[39;00m\n\u001b[1;32m     83\u001b[0m \u001b[39m    \"\"\"\u001b[39;00m\n\u001b[0;32m---> 84\u001b[0m     check_consistent_length(y_true, y_pred)\n\u001b[1;32m     85\u001b[0m     type_true \u001b[39m=\u001b[39m type_of_target(y_true, input_name\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39my_true\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m     86\u001b[0m     type_pred \u001b[39m=\u001b[39m type_of_target(y_pred, input_name\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39my_pred\u001b[39m\u001b[39m\"\u001b[39m)\n",
+      "File \u001b[0;32m~/miniconda3/envs/COVID_forecasting/lib/python3.11/site-packages/sklearn/utils/validation.py:409\u001b[0m, in \u001b[0;36mcheck_consistent_length\u001b[0;34m(*arrays)\u001b[0m\n\u001b[1;32m    407\u001b[0m uniques \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39munique(lengths)\n\u001b[1;32m    408\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(uniques) \u001b[39m>\u001b[39m \u001b[39m1\u001b[39m:\n\u001b[0;32m--> 409\u001b[0m     \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m    410\u001b[0m         \u001b[39m\"\u001b[39m\u001b[39mFound input variables with inconsistent numbers of samples: \u001b[39m\u001b[39m%r\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[1;32m    411\u001b[0m         \u001b[39m%\u001b[39m [\u001b[39mint\u001b[39m(l) \u001b[39mfor\u001b[39;00m l \u001b[39min\u001b[39;00m lengths]\n\u001b[1;32m    412\u001b[0m     )\n",
+      "\u001b[0;31mValueError\u001b[0m: Found input variables with inconsistent numbers of samples: [2409, 10439]"
      ]
     }
    ],
@@ -10414,13 +10441,12 @@
     "print(\"MCC:\", MCC)\n",
     "print(\"Accuracy:\", accuracy)\n",
     "print(\"auROC:\", ROC)\n",
-    "\n",
-    "print(confusion_matrix(y_test, y_pred))"
+    "\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 122,
+   "execution_count": 127,
    "metadata": {},
    "outputs": [
     {
@@ -10453,12 +10479,61 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 165,
+   "execution_count": 131,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "MCC: 0.7038956549471436\n",
+      "Accuracy: 0.9134016668263244\n",
+      "auROC: 0.8872189145023346\n"
+     ]
+    }
+   ],
+   "source": [
+    "y_pred = clf.predict(X_test_full)\n",
+    "y_pred_proba = clf.predict_proba(X_test_full)\n",
+    "\n",
+    "# Evaluate the model: accuracy, auROC, and MCC rescaled from [-1, 1] to [0, 1] via (MCC + 1) / 2; the printed MCC is the rescaled value\n",
+    "accuracy = accuracy_score(y_test_full, y_pred)\n",
+    "ROC = roc_auc_score(y_test_full, y_pred_proba[:,1])\n",
+    "MCC = (matthews_corrcoef(y_test_full, y_pred) + 1)/2\n",
+    "\n",
+    "print(\"MCC:\", MCC)\n",
+    "print(\"Accuracy:\", accuracy)\n",
+    "print(\"auROC:\", ROC)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 132,
    "metadata": {},
    "outputs": [],
    "source": [
-    "#model_name = f\"CDC_classifier_auroc_{ROC:.4f}_CDC_period_full.sav\"\n",
-    "#pickle.dump(clf, open(model_name, 'wb'))"
+    "# Persist the fitted classifier; the filename embeds the auROC (ROC) to 4 decimal places.\n",
+    "model_name = f\"CDC_classifier_auroc_{ROC:.4f}_CDC_period_full.sav\"\n",
+    "# Use a context manager so the file handle is flushed and closed once the pickle is written.\n",
+    "with open(model_name, 'wb') as model_file:\n",
+    "    pickle.dump(clf, model_file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 133,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'CDC_classifier_auroc_0.8872_CDC_period_full.sav'"
+      ]
+     },
+     "execution_count": 133,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model_name"
    ]
   },
   {
@@ -10604,23 +10679,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 92,
+   "execution_count": 134,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "MCC: 0.47932871282255496\n",
-      "Accuracy: 0.14404317144043172\n",
-      "auROC: 0.531625516761377\n"
+      "MCC: 0.7089756779440055\n",
+      "Accuracy: 0.7667081776670818\n",
+      "auROC: 0.8185285531837256\n"
      ]
     }
    ],
    "source": [
     "X_test, y_test, weights_test, missing_data_test_HSA = prep_training_test_data(all_HSA_ID_weekly_data,   no_weeks = range(2, 5), weeks_in_future = 3,  geography = 'HSA_ID',  weight_col = 'weight', keep_output = True) # account for the fact that week 1 is the week included to allow for calculation of delta\n",
     "\n",
-    "full_model = pickle.load(open('/Users/rem76/Documents/COVID_projections/COVID_forecasting/CDC_classifier_auroc_0.9091_CDC_period_full.sav', 'rb'))\n",
+    "# NOTE: pickle.load can execute arbitrary code while deserialising; only load model files produced by this project.\n",
+    "# The path is relative to the working directory (the notebook's folder by default) instead of a machine-specific absolute path.\n",
+    "with open('CDC_classifier_auroc_0.8872_CDC_period_full.sav', 'rb') as model_file:\n",
+    "    full_model = pickle.load(model_file)\n",
     "# The classifier is loaded pre-trained; no training happens in this cell.\n",
     "\n",
     "# Make predictions on the test set\n",