{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a99d1fa8",
   "metadata": {
    "papermill": {
     "duration": 0.005626,
     "end_time": "2023-03-26T15:11:33.626518",
     "exception": false,
     "start_time": "2023-03-26T15:11:33.620892",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "# **Prepare SVD for multiome**\n",
    "\n",
    "In this Jupyter notebook, data from train and test datasets is put together and then the TruncatedSVD is calculated. This is done twice: once for data normalized by organizers, and then for raw data. Only SVD features made from normalized data were used in a final submission.\n",
    "\n",
    "In kaggle environment it is more convenient to do this in a separate notebook, as it would be a waste of both time and GPU quota to calculate the TruncatedSVD each time before fitting the model.\n",
    "\n",
    "## Imports and definitions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "2a7654bc",
   "metadata": {
    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
    "execution": {
     "iopub.execute_input": "2023-03-26T15:11:33.637815Z",
     "iopub.status.busy": "2023-03-26T15:11:33.637242Z",
     "iopub.status.idle": "2023-03-26T15:11:34.849105Z",
     "shell.execute_reply": "2023-03-26T15:11:34.848042Z"
    },
    "papermill": {
     "duration": 1.221242,
     "end_time": "2023-03-26T15:11:34.852344",
     "exception": false,
     "start_time": "2023-03-26T15:11:33.631102",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Importing the libraries\n",
    "import numpy as np # linear algebra\n",
    "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
    "\n",
    "import os\n",
    "import gc, scipy.sparse\n",
    "from humanize import naturalsize\n",
    "from sklearn.decomposition import TruncatedSVD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "8fb19abf",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-03-26T15:11:34.864639Z",
     "iopub.status.busy": "2023-03-26T15:11:34.863570Z",
     "iopub.status.idle": "2023-03-26T15:11:47.509689Z",
     "shell.execute_reply": "2023-03-26T15:11:47.508400Z"
    },
    "papermill": {
     "duration": 12.655546,
     "end_time": "2023-03-26T15:11:47.512773",
     "exception": false,
     "start_time": "2023-03-26T15:11:34.857227",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\r\n",
      "\u001b[0m"
     ]
    }
   ],
   "source": [
    "# Need this library to read *.h5 files\n",
    "!pip install --quiet tables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ef0cfb17",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-03-26T15:11:47.525502Z",
     "iopub.status.busy": "2023-03-26T15:11:47.525034Z",
     "iopub.status.idle": "2023-03-26T15:11:47.533344Z",
     "shell.execute_reply": "2023-03-26T15:11:47.532143Z"
    },
    "papermill": {
     "duration": 0.017817,
     "end_time": "2023-03-26T15:11:47.535703",
     "exception": false,
     "start_time": "2023-03-26T15:11:47.517886",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "DATA_DIR = \"/kaggle/input/open-problems-multimodal/\"\n",
    "FP_CELL_METADATA = os.path.join(DATA_DIR,\"metadata.csv\")\n",
    "\n",
    "FP_CITE_TRAIN_INPUTS = os.path.join(DATA_DIR,\"train_cite_inputs.h5\")\n",
    "FP_CITE_TRAIN_TARGETS = os.path.join(DATA_DIR,\"train_cite_targets.h5\")\n",
    "FP_CITE_TEST_INPUTS = os.path.join(DATA_DIR,\"test_cite_inputs.h5\")\n",
    "\n",
    "FP_MULTIOME_TRAIN_INPUTS = os.path.join(DATA_DIR,\"train_multi_inputs.h5\")\n",
    "FP_MULTIOME_TRAIN_TARGETS = os.path.join(DATA_DIR,\"train_multi_targets.h5\")\n",
    "FP_MULTIOME_TEST_INPUTS = os.path.join(DATA_DIR,\"test_multi_inputs.h5\")\n",
    "\n",
    "FP_SUBMISSION = os.path.join(DATA_DIR,\"sample_submission.csv\")\n",
    "FP_EVALUATION_IDS = os.path.join(DATA_DIR,\"evaluation_ids.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "23f2d01a",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-03-26T15:11:47.547201Z",
     "iopub.status.busy": "2023-03-26T15:11:47.546750Z",
     "iopub.status.idle": "2023-03-26T15:11:47.554602Z",
     "shell.execute_reply": "2023-03-26T15:11:47.553465Z"
    },
    "papermill": {
     "duration": 0.016003,
     "end_time": "2023-03-26T15:11:47.556793",
     "exception": false,
     "start_time": "2023-03-26T15:11:47.540790",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# The multiome train dataset raw data is too large to be loaded into RAM. But it is also sparse.\n",
    "# So, I load the dataset in chunks, and then convert it to sparse matrix.\n",
    "# Will use this function to do right that.\n",
    "\n",
    "\n",
    "def read_convert_hdf_in_chunks(link, chunk_size, sparse_matrice=None):\n",
    "    i = 0\n",
    "    while i < 1000000:\n",
    "        df_chunk = pd.read_hdf(link, start=i, stop=i+chunk_size)\n",
    "        sparse_chunk = scipy.sparse.csr_matrix(df_chunk.values)\n",
    "        if sparse_matrice == None:\n",
    "            sparse_matrice = sparse_chunk\n",
    "        else:\n",
    "            sparse_matrice = scipy.sparse.vstack([sparse_matrice, sparse_chunk])\n",
    "        print(i)\n",
    "        i += chunk_size\n",
    "        if sparse_chunk.shape[0] < chunk_size:\n",
    "            return sparse_matrice\n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7faa8030",
   "metadata": {
    "papermill": {
     "duration": 0.004217,
     "end_time": "2023-03-26T15:11:47.566096",
     "exception": false,
     "start_time": "2023-03-26T15:11:47.561879",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "## Process the raw data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "6568c30d",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-03-26T15:11:47.577155Z",
     "iopub.status.busy": "2023-03-26T15:11:47.576717Z",
     "iopub.status.idle": "2023-03-26T15:30:05.975430Z",
     "shell.execute_reply": "2023-03-26T15:30:05.973817Z"
    },
    "papermill": {
     "duration": 1098.408278,
     "end_time": "2023-03-26T15:30:05.979105",
     "exception": false,
     "start_time": "2023-03-26T15:11:47.570827",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "5000\n",
      "10000\n",
      "15000\n",
      "20000\n",
      "25000\n",
      "30000\n",
      "35000\n",
      "40000\n",
      "45000\n",
      "50000\n",
      "55000\n",
      "60000\n",
      "65000\n",
      "70000\n",
      "75000\n",
      "80000\n",
      "85000\n",
      "90000\n",
      "95000\n",
      "100000\n",
      "105000\n",
      "105933\n",
      "CPU times: user 16min 52s, sys: 1min 13s, total: 18min 5s\n",
      "Wall time: 18min 18s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "34"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "# Loading raw data inputs\n",
    "\n",
    "sparse_X = read_convert_hdf_in_chunks('../input/open-problems-raw-counts/train_multi_inputs_raw.h5', 5000)\n",
    "print(sparse_X.shape[0])\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "802a3bb1",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-03-26T15:30:05.993680Z",
     "iopub.status.busy": "2023-03-26T15:30:05.993289Z",
     "iopub.status.idle": "2023-03-26T15:39:49.838277Z",
     "shell.execute_reply": "2023-03-26T15:39:49.837072Z"
    },
    "papermill": {
     "duration": 583.855343,
     "end_time": "2023-03-26T15:39:49.840846",
     "exception": false,
     "start_time": "2023-03-26T15:30:05.985503",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "5000\n",
      "10000\n",
      "15000\n",
      "20000\n",
      "25000\n",
      "30000\n",
      "35000\n",
      "40000\n",
      "45000\n",
      "50000\n",
      "55000\n",
      "161868\n",
      "CPU times: user 8min 49s, sys: 46.6 s, total: 9min 36s\n",
      "Wall time: 9min 43s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "75"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "# Same procedure for the test raw data.\n",
    "sparse_X = read_convert_hdf_in_chunks('/kaggle/input/open-problems-raw-counts/test_multi_inputs_raw.h5', 5000, sparse_X)\n",
    "print(sparse_X.shape[0])\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "544a6b3f",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-03-26T15:39:49.858139Z",
     "iopub.status.busy": "2023-03-26T15:39:49.856591Z",
     "iopub.status.idle": "2023-03-26T15:39:50.407194Z",
     "shell.execute_reply": "2023-03-26T15:39:50.405499Z"
    },
    "papermill": {
     "duration": 0.563082,
     "end_time": "2023-03-26T15:39:50.410897",
     "exception": false,
     "start_time": "2023-03-26T15:39:49.847815",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Export total_counts. Maybe they will be useful as a feature.\n",
    "total_counts = sparse_X.sum(axis=1)\n",
    "counts_index = [*range(len(total_counts))]\n",
    "total_counts = total_counts.flat\n",
    "df_total_counts = pd.DataFrame({'total_counts': total_counts}, index=counts_index)\n",
    "df_total_counts.to_feather('total_counts_multiome.ftr')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "1a49d6ef",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-03-26T15:39:50.428010Z",
     "iopub.status.busy": "2023-03-26T15:39:50.427543Z",
     "iopub.status.idle": "2023-03-26T16:01:28.936016Z",
     "shell.execute_reply": "2023-03-26T16:01:28.933114Z"
    },
    "papermill": {
     "duration": 1298.537214,
     "end_time": "2023-03-26T16:01:28.955652",
     "exception": false,
     "start_time": "2023-03-26T15:39:50.418438",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Shape of both before SVD: (161868, 228942)\n",
      "Shape of both after SVD:  (161868, 64)\n",
      "CPU times: user 21min 46s, sys: 9.47 s, total: 21min 56s\n",
      "Wall time: 21min 38s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Apply the singular value decomposition.\n",
    "\n",
    "print(f\"Shape of both before SVD: {sparse_X.shape}\")\n",
    "svd = TruncatedSVD(n_components=64, random_state=1)\n",
    "sparse_X = svd.fit_transform(sparse_X)\n",
    "print(f\"Shape of both after SVD:  {sparse_X.shape}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "faec350c",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-03-26T16:01:28.975382Z",
     "iopub.status.busy": "2023-03-26T16:01:28.974809Z",
     "iopub.status.idle": "2023-03-26T16:01:38.711577Z",
     "shell.execute_reply": "2023-03-26T16:01:38.710063Z"
    },
    "papermill": {
     "duration": 9.751836,
     "end_time": "2023-03-26T16:01:38.715236",
     "exception": false,
     "start_time": "2023-03-26T16:01:28.963400",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Raw data SVD ready\n"
     ]
    }
   ],
   "source": [
    "# Save results in a file.\n",
    "df_svd = pd.DataFrame(sparse_X)\n",
    "df_svd.to_csv('svd_raw.csv')\n",
    "print('Raw data SVD ready')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "b9fb2fac",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-03-26T16:01:38.733851Z",
     "iopub.status.busy": "2023-03-26T16:01:38.733423Z",
     "iopub.status.idle": "2023-03-26T16:01:38.901106Z",
     "shell.execute_reply": "2023-03-26T16:01:38.899979Z"
    },
    "papermill": {
     "duration": 0.179907,
     "end_time": "2023-03-26T16:01:38.903684",
     "exception": false,
     "start_time": "2023-03-26T16:01:38.723777",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "21"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Free the RAM.\n",
    "del sparse_X, df_svd\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c9cf65f9",
   "metadata": {
    "papermill": {
     "duration": 0.007482,
     "end_time": "2023-03-26T16:01:38.919004",
     "exception": false,
     "start_time": "2023-03-26T16:01:38.911522",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "## Process the normalized data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "05ae1937",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-03-26T16:01:38.935416Z",
     "iopub.status.busy": "2023-03-26T16:01:38.935020Z",
     "iopub.status.idle": "2023-03-26T16:23:21.382204Z",
     "shell.execute_reply": "2023-03-26T16:23:21.380465Z"
    },
    "papermill": {
     "duration": 1302.458497,
     "end_time": "2023-03-26T16:23:21.384886",
     "exception": false,
     "start_time": "2023-03-26T16:01:38.926389",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "5000\n",
      "10000\n",
      "15000\n",
      "20000\n",
      "25000\n",
      "30000\n",
      "35000\n",
      "40000\n",
      "45000\n",
      "50000\n",
      "55000\n",
      "60000\n",
      "65000\n",
      "70000\n",
      "75000\n",
      "80000\n",
      "85000\n",
      "90000\n",
      "95000\n",
      "100000\n",
      "105000\n",
      "105942\n",
      "CPU times: user 18min 25s, sys: 1min 25s, total: 19min 51s\n",
      "Wall time: 21min 42s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "150"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "# Generally the same operations for the normalized data using the same function.\n",
    "# Load the train data in chunks and convert it to sparse matrix.\n",
    "\n",
    "sparse_X = read_convert_hdf_in_chunks(FP_MULTIOME_TRAIN_INPUTS, 5000)\n",
    "print(sparse_X.shape[0])\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "b729e0e8",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-03-26T16:23:21.404437Z",
     "iopub.status.busy": "2023-03-26T16:23:21.403954Z",
     "iopub.status.idle": "2023-03-26T16:34:25.339363Z",
     "shell.execute_reply": "2023-03-26T16:34:25.337535Z"
    },
    "papermill": {
     "duration": 663.948827,
     "end_time": "2023-03-26T16:34:25.342225",
     "exception": false,
     "start_time": "2023-03-26T16:23:21.393398",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "5000\n",
      "10000\n",
      "15000\n",
      "20000\n",
      "25000\n",
      "30000\n",
      "35000\n",
      "40000\n",
      "45000\n",
      "50000\n",
      "55000\n",
      "161877\n",
      "CPU times: user 9min 8s, sys: 50.7 s, total: 9min 59s\n",
      "Wall time: 11min 3s\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%%time\n",
    "# Same for normalized test dataset.\n",
    "\n",
    "sparse_X = read_convert_hdf_in_chunks(FP_MULTIOME_TEST_INPUTS, 5000, sparse_X)\n",
    "print(sparse_X.shape[0])\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "75155407",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-03-26T16:34:25.365178Z",
     "iopub.status.busy": "2023-03-26T16:34:25.364744Z",
     "iopub.status.idle": "2023-03-26T17:43:25.015549Z",
     "shell.execute_reply": "2023-03-26T17:43:25.012447Z"
    },
    "papermill": {
     "duration": 4139.678463,
     "end_time": "2023-03-26T17:43:25.030988",
     "exception": false,
     "start_time": "2023-03-26T16:34:25.352525",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Shape of both before SVD: (161877, 228942)\n",
      "Shape of both after SVD:  (161877, 256)\n",
      "CPU times: user 1h 9min 43s, sys: 29.3 s, total: 1h 10min 13s\n",
      "Wall time: 1h 8min 59s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "# Apply the singular value decomposition.\n",
    "# Normalized data is more important, so I will prepare more components.\n",
    "\n",
    "print(f\"Shape of both before SVD: {sparse_X.shape}\")\n",
    "svd = TruncatedSVD(n_components=256, random_state=1)\n",
    "sparse_X = svd.fit_transform(sparse_X)\n",
    "print(f\"Shape of both after SVD:  {sparse_X.shape}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "087f142e",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-03-26T17:43:25.053242Z",
     "iopub.status.busy": "2023-03-26T17:43:25.052717Z",
     "iopub.status.idle": "2023-03-26T17:44:04.388397Z",
     "shell.execute_reply": "2023-03-26T17:44:04.387211Z"
    },
    "papermill": {
     "duration": 39.360847,
     "end_time": "2023-03-26T17:44:04.401455",
     "exception": false,
     "start_time": "2023-03-26T17:43:25.040608",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "All the SVD ready\n"
     ]
    }
   ],
   "source": [
    "# Save results in a file.\n",
    "df_svd = pd.DataFrame(sparse_X)\n",
    "df_svd.to_csv('svd.csv')\n",
    "print('All the SVD ready')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.12"
  },
  "papermill": {
   "default_parameters": {},
   "duration": 9161.84408,
   "end_time": "2023-03-26T17:44:05.767493",
   "environment_variables": {},
   "exception": null,
   "input_path": "__notebook__.ipynb",
   "output_path": "__notebook__.ipynb",
   "parameters": {},
   "start_time": "2023-03-26T15:11:23.923413",
   "version": "2.3.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}