{ "cells": [ { "cell_type": "markdown", "id": "a99d1fa8", "metadata": { "papermill": { "duration": 0.005626, "end_time": "2023-03-26T15:11:33.626518", "exception": false, "start_time": "2023-03-26T15:11:33.620892", "status": "completed" }, "tags": [] }, "source": [ "# **Prepare SVD for multiome**\n", "\n", "In this Jupyter notebook, data from train and test datasets is put together and then the TruncatedSVD is calculated. This is done twice: once for data normalized by organizers, and then for raw data. Only SVD features made from normalized data were used in a final submission.\n", "\n", "In kaggle environment it is more convenient to do this in a separate notebook, as it would be a waste of both time and GPU quota to calculate the TruncatedSVD each time before fitting the model.\n", "\n", "## Imports and definitions" ] }, { "cell_type": "code", "execution_count": 1, "id": "2a7654bc", "metadata": { "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", "execution": { "iopub.execute_input": "2023-03-26T15:11:33.637815Z", "iopub.status.busy": "2023-03-26T15:11:33.637242Z", "iopub.status.idle": "2023-03-26T15:11:34.849105Z", "shell.execute_reply": "2023-03-26T15:11:34.848042Z" }, "papermill": { "duration": 1.221242, "end_time": "2023-03-26T15:11:34.852344", "exception": false, "start_time": "2023-03-26T15:11:33.631102", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# Importing the libraries\n", "import numpy as np # linear algebra\n", "import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\n", "\n", "import os\n", "import gc, scipy.sparse\n", "from humanize import naturalsize\n", "from sklearn.decomposition import TruncatedSVD" ] }, { "cell_type": "code", "execution_count": 2, "id": "8fb19abf", "metadata": { "execution": { "iopub.execute_input": "2023-03-26T15:11:34.864639Z", "iopub.status.busy": "2023-03-26T15:11:34.863570Z", "iopub.status.idle": "2023-03-26T15:11:47.509689Z", "shell.execute_reply": "2023-03-26T15:11:47.508400Z" }, "papermill": { "duration": 12.655546, "end_time": "2023-03-26T15:11:47.512773", "exception": false, "start_time": "2023-03-26T15:11:34.857227", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\r\n", "\u001b[0m" ] } ], "source": [ "# Need this library to read *.h5 files\n", "!pip install --quiet tables" ] }, { "cell_type": "code", "execution_count": 3, "id": "ef0cfb17", "metadata": { "execution": { "iopub.execute_input": "2023-03-26T15:11:47.525502Z", "iopub.status.busy": "2023-03-26T15:11:47.525034Z", "iopub.status.idle": "2023-03-26T15:11:47.533344Z", "shell.execute_reply": "2023-03-26T15:11:47.532143Z" }, "papermill": { "duration": 0.017817, "end_time": "2023-03-26T15:11:47.535703", "exception": false, "start_time": "2023-03-26T15:11:47.517886", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "DATA_DIR = \"/kaggle/input/open-problems-multimodal/\"\n", "FP_CELL_METADATA = os.path.join(DATA_DIR,\"metadata.csv\")\n", "\n", "FP_CITE_TRAIN_INPUTS = os.path.join(DATA_DIR,\"train_cite_inputs.h5\")\n", "FP_CITE_TRAIN_TARGETS = os.path.join(DATA_DIR,\"train_cite_targets.h5\")\n", "FP_CITE_TEST_INPUTS = os.path.join(DATA_DIR,\"test_cite_inputs.h5\")\n", "\n", "FP_MULTIOME_TRAIN_INPUTS = 
# The multiome train dataset raw data is too large to be loaded into RAM. But it is also sparse.
# So the dataset is loaded in chunks, and every chunk is converted to a sparse matrix right away.


def read_convert_hdf_in_chunks(link, chunk_size, sparse_matrice=None):
    """Read an HDF5 table chunk by chunk and return it as one sparse matrix.

    Every chunk of ``chunk_size`` rows is read with ``pd.read_hdf`` and converted
    to a CSR matrix immediately, so only one dense chunk is held at a time.
    Reading stops after the first chunk that has fewer than ``chunk_size`` rows
    (this last chunk may be empty when the row count is an exact multiple of
    ``chunk_size``). The start offset of every chunk is printed as progress
    output.

    Parameters
    ----------
    link : str
        Path of the HDF5 file; passed unchanged to ``pd.read_hdf``.
    chunk_size : int
        Number of rows requested per read. Must be positive.
    sparse_matrice : scipy sparse matrix, optional
        Rows loaded earlier. When given, the newly read rows are stacked
        underneath it, which is how the notebook appends test rows to train rows.

    Returns
    -------
    scipy sparse matrix
        ``sparse_matrice`` (if given) followed by all rows read from ``link``.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive (the read offset would never advance).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # `is None`, not `== None`: `==` on a sparse matrix is an overloaded
    # comparison operator, not an identity check.
    blocks = [] if sparse_matrice is None else [sparse_matrice]

    start = 0
    # Loop until a short chunk signals the end of the table. There is no fixed
    # row cap, so large tables are read completely instead of the function
    # silently returning None.
    while True:
        df_chunk = pd.read_hdf(link, start=start, stop=start + chunk_size)
        sparse_chunk = scipy.sparse.csr_matrix(df_chunk.values)
        blocks.append(sparse_chunk)
        print(start)
        start += chunk_size
        if sparse_chunk.shape[0] < chunk_size:
            break

    if len(blocks) == 1:
        # A single chunk and nothing to append to: return it without copying.
        return blocks[0]
    # Stack once at the end; re-stacking the accumulated matrix inside the loop
    # would copy all previously read rows on every iteration.
    return scipy.sparse.vstack(blocks)
"execution": { "iopub.execute_input": "2023-03-26T15:11:47.577155Z", "iopub.status.busy": "2023-03-26T15:11:47.576717Z", "iopub.status.idle": "2023-03-26T15:30:05.975430Z", "shell.execute_reply": "2023-03-26T15:30:05.973817Z" }, "papermill": { "duration": 1098.408278, "end_time": "2023-03-26T15:30:05.979105", "exception": false, "start_time": "2023-03-26T15:11:47.570827", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n", "5000\n", "10000\n", "15000\n", "20000\n", "25000\n", "30000\n", "35000\n", "40000\n", "45000\n", "50000\n", "55000\n", "60000\n", "65000\n", "70000\n", "75000\n", "80000\n", "85000\n", "90000\n", "95000\n", "100000\n", "105000\n", "105933\n", "CPU times: user 16min 52s, sys: 1min 13s, total: 18min 5s\n", "Wall time: 18min 18s\n" ] }, { "data": { "text/plain": [ "34" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "# Loading raw data inputs\n", "\n", "sparse_X = read_convert_hdf_in_chunks('../input/open-problems-raw-counts/train_multi_inputs_raw.h5', 5000)\n", "print(sparse_X.shape[0])\n", "gc.collect()" ] }, { "cell_type": "code", "execution_count": 6, "id": "802a3bb1", "metadata": { "execution": { "iopub.execute_input": "2023-03-26T15:30:05.993680Z", "iopub.status.busy": "2023-03-26T15:30:05.993289Z", "iopub.status.idle": "2023-03-26T15:39:49.838277Z", "shell.execute_reply": "2023-03-26T15:39:49.837072Z" }, "papermill": { "duration": 583.855343, "end_time": "2023-03-26T15:39:49.840846", "exception": false, "start_time": "2023-03-26T15:30:05.985503", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n", "5000\n", "10000\n", "15000\n", "20000\n", "25000\n", "30000\n", "35000\n", "40000\n", "45000\n", "50000\n", "55000\n", "161868\n", "CPU times: user 8min 49s, sys: 46.6 s, total: 9min 36s\n", "Wall time: 9min 43s\n" ] }, { "data": { "text/plain": [ "75" ] }, 
"execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "# Same procedure for the test raw data.\n", "sparse_X = read_convert_hdf_in_chunks('/kaggle/input/open-problems-raw-counts/test_multi_inputs_raw.h5', 5000, sparse_X)\n", "print(sparse_X.shape[0])\n", "gc.collect()" ] }, { "cell_type": "code", "execution_count": 7, "id": "544a6b3f", "metadata": { "execution": { "iopub.execute_input": "2023-03-26T15:39:49.858139Z", "iopub.status.busy": "2023-03-26T15:39:49.856591Z", "iopub.status.idle": "2023-03-26T15:39:50.407194Z", "shell.execute_reply": "2023-03-26T15:39:50.405499Z" }, "papermill": { "duration": 0.563082, "end_time": "2023-03-26T15:39:50.410897", "exception": false, "start_time": "2023-03-26T15:39:49.847815", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# Export total_counts. Maybe they will be useful as a feature.\n", "total_counts = sparse_X.sum(axis=1)\n", "counts_index = [*range(len(total_counts))]\n", "total_counts = total_counts.flat\n", "df_total_counts = pd.DataFrame({'total_counts': total_counts}, index=counts_index)\n", "df_total_counts.to_feather('total_counts_multiome.ftr')" ] }, { "cell_type": "code", "execution_count": 8, "id": "1a49d6ef", "metadata": { "execution": { "iopub.execute_input": "2023-03-26T15:39:50.428010Z", "iopub.status.busy": "2023-03-26T15:39:50.427543Z", "iopub.status.idle": "2023-03-26T16:01:28.936016Z", "shell.execute_reply": "2023-03-26T16:01:28.933114Z" }, "papermill": { "duration": 1298.537214, "end_time": "2023-03-26T16:01:28.955652", "exception": false, "start_time": "2023-03-26T15:39:50.418438", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Shape of both before SVD: (161868, 228942)\n", "Shape of both after SVD: (161868, 64)\n", "CPU times: user 21min 46s, sys: 9.47 s, total: 21min 56s\n", "Wall time: 21min 38s\n" ] } ], "source": [ "%%time\n", "# Apply the singular value decomposition.\n", 
"\n", "print(f\"Shape of both before SVD: {sparse_X.shape}\")\n", "svd = TruncatedSVD(n_components=64, random_state=1)\n", "sparse_X = svd.fit_transform(sparse_X)\n", "print(f\"Shape of both after SVD: {sparse_X.shape}\")" ] }, { "cell_type": "code", "execution_count": 9, "id": "faec350c", "metadata": { "execution": { "iopub.execute_input": "2023-03-26T16:01:28.975382Z", "iopub.status.busy": "2023-03-26T16:01:28.974809Z", "iopub.status.idle": "2023-03-26T16:01:38.711577Z", "shell.execute_reply": "2023-03-26T16:01:38.710063Z" }, "papermill": { "duration": 9.751836, "end_time": "2023-03-26T16:01:38.715236", "exception": false, "start_time": "2023-03-26T16:01:28.963400", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Raw data SVD ready\n" ] } ], "source": [ "# Save results in a file.\n", "df_svd = pd.DataFrame(sparse_X)\n", "df_svd.to_csv('svd_raw.csv')\n", "print('Raw data SVD ready')" ] }, { "cell_type": "code", "execution_count": 10, "id": "b9fb2fac", "metadata": { "execution": { "iopub.execute_input": "2023-03-26T16:01:38.733851Z", "iopub.status.busy": "2023-03-26T16:01:38.733423Z", "iopub.status.idle": "2023-03-26T16:01:38.901106Z", "shell.execute_reply": "2023-03-26T16:01:38.899979Z" }, "papermill": { "duration": 0.179907, "end_time": "2023-03-26T16:01:38.903684", "exception": false, "start_time": "2023-03-26T16:01:38.723777", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "21" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Free the RAM.\n", "del sparse_X, df_svd\n", "gc.collect()" ] }, { "cell_type": "markdown", "id": "c9cf65f9", "metadata": { "papermill": { "duration": 0.007482, "end_time": "2023-03-26T16:01:38.919004", "exception": false, "start_time": "2023-03-26T16:01:38.911522", "status": "completed" }, "tags": [] }, "source": [ "## Process the normalized data" ] }, { "cell_type": "code", "execution_count": 11, 
"id": "05ae1937", "metadata": { "execution": { "iopub.execute_input": "2023-03-26T16:01:38.935416Z", "iopub.status.busy": "2023-03-26T16:01:38.935020Z", "iopub.status.idle": "2023-03-26T16:23:21.382204Z", "shell.execute_reply": "2023-03-26T16:23:21.380465Z" }, "papermill": { "duration": 1302.458497, "end_time": "2023-03-26T16:23:21.384886", "exception": false, "start_time": "2023-03-26T16:01:38.926389", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n", "5000\n", "10000\n", "15000\n", "20000\n", "25000\n", "30000\n", "35000\n", "40000\n", "45000\n", "50000\n", "55000\n", "60000\n", "65000\n", "70000\n", "75000\n", "80000\n", "85000\n", "90000\n", "95000\n", "100000\n", "105000\n", "105942\n", "CPU times: user 18min 25s, sys: 1min 25s, total: 19min 51s\n", "Wall time: 21min 42s\n" ] }, { "data": { "text/plain": [ "150" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "# Generally the same operations for the normalized data using the same function.\n", "# Load the train data in chunks and convert it to sparse matrix.\n", "\n", "sparse_X = read_convert_hdf_in_chunks(FP_MULTIOME_TRAIN_INPUTS, 5000)\n", "print(sparse_X.shape[0])\n", "gc.collect()" ] }, { "cell_type": "code", "execution_count": 12, "id": "b729e0e8", "metadata": { "execution": { "iopub.execute_input": "2023-03-26T16:23:21.404437Z", "iopub.status.busy": "2023-03-26T16:23:21.403954Z", "iopub.status.idle": "2023-03-26T16:34:25.339363Z", "shell.execute_reply": "2023-03-26T16:34:25.337535Z" }, "papermill": { "duration": 663.948827, "end_time": "2023-03-26T16:34:25.342225", "exception": false, "start_time": "2023-03-26T16:23:21.393398", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0\n", "5000\n", "10000\n", "15000\n", "20000\n", "25000\n", "30000\n", "35000\n", "40000\n", "45000\n", "50000\n", "55000\n", "161877\n", "CPU times: user 
9min 8s, sys: 50.7 s, total: 9min 59s\n", "Wall time: 11min 3s\n" ] }, { "data": { "text/plain": [ "0" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "# Same for normalized test dataset.\n", "\n", "sparse_X = read_convert_hdf_in_chunks(FP_MULTIOME_TEST_INPUTS, 5000, sparse_X)\n", "print(sparse_X.shape[0])\n", "gc.collect()" ] }, { "cell_type": "code", "execution_count": 13, "id": "75155407", "metadata": { "execution": { "iopub.execute_input": "2023-03-26T16:34:25.365178Z", "iopub.status.busy": "2023-03-26T16:34:25.364744Z", "iopub.status.idle": "2023-03-26T17:43:25.015549Z", "shell.execute_reply": "2023-03-26T17:43:25.012447Z" }, "papermill": { "duration": 4139.678463, "end_time": "2023-03-26T17:43:25.030988", "exception": false, "start_time": "2023-03-26T16:34:25.352525", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Shape of both before SVD: (161877, 228942)\n", "Shape of both after SVD: (161877, 256)\n", "CPU times: user 1h 9min 43s, sys: 29.3 s, total: 1h 10min 13s\n", "Wall time: 1h 8min 59s\n" ] } ], "source": [ "%%time\n", "# Apply the singular value decomposition.\n", "# Normalized data is more important, so I will prepare more components.\n", "\n", "print(f\"Shape of both before SVD: {sparse_X.shape}\")\n", "svd = TruncatedSVD(n_components=256, random_state=1)\n", "sparse_X = svd.fit_transform(sparse_X)\n", "print(f\"Shape of both after SVD: {sparse_X.shape}\")" ] }, { "cell_type": "code", "execution_count": 14, "id": "087f142e", "metadata": { "execution": { "iopub.execute_input": "2023-03-26T17:43:25.053242Z", "iopub.status.busy": "2023-03-26T17:43:25.052717Z", "iopub.status.idle": "2023-03-26T17:44:04.388397Z", "shell.execute_reply": "2023-03-26T17:44:04.387211Z" }, "papermill": { "duration": 39.360847, "end_time": "2023-03-26T17:44:04.401455", "exception": false, "start_time": "2023-03-26T17:43:25.040608", "status": "completed" }, 
"tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "All the SVD ready\n" ] } ], "source": [ "# Save results in a file.\n", "df_svd = pd.DataFrame(sparse_X)\n", "df_svd.to_csv('svd.csv')\n", "print('All the SVD ready')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.12" }, "papermill": { "default_parameters": {}, "duration": 9161.84408, "end_time": "2023-03-26T17:44:05.767493", "environment_variables": {}, "exception": null, "input_path": "__notebook__.ipynb", "output_path": "__notebook__.ipynb", "parameters": {}, "start_time": "2023-03-26T15:11:23.923413", "version": "2.3.4" } }, "nbformat": 4, "nbformat_minor": 5 }