|
2 | 2 | "cells": [
|
3 | 3 | {
|
4 | 4 | "cell_type": "code",
|
5 |
| - "execution_count": 1, |
| 5 | + "execution_count": 22, |
| 6 | + "metadata": {}, |
| 7 | + "outputs": [], |
| 8 | + "source": [ |
| 9 | + "%%capture\n", |
| 10 | + "!pip install fcd" |
| 11 | + ] |
| 12 | + }, |
| 13 | + { |
| 14 | + "cell_type": "code", |
| 15 | + "execution_count": 23, |
| 16 | + "metadata": {}, |
| 17 | + "outputs": [ |
| 18 | + { |
| 19 | + "name": "stdout", |
| 20 | + "output_type": "stream", |
| 21 | + "text": [ |
| 22 | + "--2024-04-01 18:53:15-- https://raw.githubusercontent.com/bioinf-jku/FCD/master/generated_smiles/LSTM_Segler.smi\n", |
| 23 | + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...\n", |
| 24 | + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.\n", |
| 25 | + "HTTP request sent, awaiting response... 200 OK\n", |
| 26 | + "Length: 22730454 (22M) [text/plain]\n", |
| 27 | + "Saving to: ‘generated_smiles/LSTM_Segler.smi’\n", |
| 28 | + "\n", |
| 29 | + "generated_smiles/LS 100%[===================>] 21.68M 3.07MB/s in 7.3s \n", |
| 30 | + "\n", |
| 31 | + "2024-04-01 18:53:22 (2.98 MB/s) - ‘generated_smiles/LSTM_Segler.smi’ saved [22730454/22730454]\n", |
| 32 | + "\n" |
| 33 | + ] |
| 34 | + } |
| 35 | + ], |
| 36 | + "source": [ |
| 37 | + "!mkdir generated_smiles -p\n", |
| 38 | + "!wget https://raw.githubusercontent.com/bioinf-jku/FCD/master/generated_smiles/LSTM_Segler.smi -O generated_smiles/LSTM_Segler.smi" |
| 39 | + ] |
| 40 | + }, |
| 41 | + { |
| 42 | + "cell_type": "code", |
| 43 | + "execution_count": 24, |
6 | 44 | "metadata": {},
|
7 | 45 | "outputs": [],
|
8 | 46 | "source": [
|
9 | 47 | "import os\n",
|
10 |
| - "from rdkit import RDLogger \n", |
| 48 | + "from rdkit import RDLogger\n", |
11 | 49 | "import numpy as np\n",
|
12 |
| - "import pandas as pd\n", |
13 |
| - "from fcd import get_fcd, load_ref_model,canonical_smiles, get_predictions, calculate_frechet_distance\n", |
14 | 50 | "\n",
|
15 |
| - "RDLogger.DisableLog('rdApp.*')\n", |
| 51 | + "from fcd import get_fcd, load_ref_model, canonical_smiles, get_predictions, calculate_frechet_distance\n", |
| 52 | + "\n", |
| 53 | + "RDLogger.DisableLog(\"rdApp.*\")\n", |
16 | 54 | "\n",
|
17 | 55 | "np.random.seed(0)\n",
|
18 |
| - "os.environ[\"CUDA_VISIBLE_DEVICES\"]= '0' #set gpu" |
| 56 | + "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\" # set gpu" |
19 | 57 | ]
|
20 | 58 | },
|
21 | 59 | {
|
|
31 | 69 | },
|
32 | 70 | {
|
33 | 71 | "cell_type": "code",
|
34 |
| - "execution_count": 2, |
| 72 | + "execution_count": 25, |
35 | 73 | "metadata": {
|
36 | 74 | "ExecuteTime": {
|
37 | 75 | "end_time": "2020-04-23T09:13:50.403933Z",
|
|
44 | 82 | "model = load_ref_model()\n",
|
45 | 83 | "\n",
|
46 | 84 | "# Load generated molecules\n",
|
47 |
| - "gen_mol_file = \"generated_smiles/LSTM_Segler.smi\" #input file which contains one generated SMILES per line\n", |
48 |
| - "gen_mol = pd.read_csv(gen_mol_file,header=None)[0] #IMPORTANT: take at least 10000 molecules as FCD can vary with sample size \n", |
| 85 | + "gen_mol_file = \"generated_smiles/LSTM_Segler.smi\" # input file which contains one generated SMILES per line\n", |
| 86 | + "with open(gen_mol_file) as f:\n", |
| 87 | + " gen_mol = f.read().split(\"\\n\")\n", |
| 88 | + "\n", |
| 89 | + "# IMPORTANT: take at least 10000 molecules as FCD can vary with sample size\n", |
49 | 90 | "sample1 = np.random.choice(gen_mol, 10000, replace=False)\n",
|
50 | 91 | "sample2 = np.random.choice(gen_mol, 10000, replace=False)\n",
|
51 | 92 | "\n",
|
|
65 | 106 | },
|
66 | 107 | {
|
67 | 108 | "cell_type": "code",
|
68 |
| - "execution_count": 3, |
| 109 | + "execution_count": 26, |
69 | 110 | "metadata": {
|
70 | 111 | "ExecuteTime": {
|
71 | 112 | "end_time": "2020-04-23T09:11:27.207953Z",
|
|
77 | 118 | "name": "stdout",
|
78 | 119 | "output_type": "stream",
|
79 | 120 | "text": [
|
80 |
| - "FCD: 0.333862289051325\n" |
| 121 | + "FCD: 0.3298386855756661\n" |
81 | 122 | ]
|
82 | 123 | }
|
83 | 124 | ],
|
84 | 125 | "source": [
|
85 |
| - "#get CHEBMLNET activations of generated molecules \n", |
| 126 | + "# get ChemNet activations of generated molecules\n", |
86 | 127 | "act1 = get_predictions(model, can_sample1)\n",
|
87 | 128 | "act2 = get_predictions(model, can_sample2)\n",
|
88 | 129 | "\n",
|
|
92 | 133 | "mu2 = np.mean(act2, axis=0)\n",
|
93 | 134 | "sigma2 = np.cov(act2.T)\n",
|
94 | 135 | "\n",
|
95 |
| - "fcd_score = calculate_frechet_distance(\n", |
96 |
| - " mu1=mu1,\n", |
97 |
| - " mu2=mu2, \n", |
98 |
| - " sigma1=sigma1,\n", |
99 |
| - " sigma2=sigma2)\n", |
| 136 | + "fcd_score = calculate_frechet_distance(mu1=mu1, mu2=mu2, sigma1=sigma1, sigma2=sigma2)\n", |
100 | 137 | "\n",
|
101 |
| - "print('FCD: ',fcd_score)" |
| 138 | + "print(\"FCD: \", fcd_score)" |
102 | 139 | ]
|
103 | 140 | },
|
104 | 141 | {
|
105 | 142 | "cell_type": "code",
|
106 |
| - "execution_count": 4, |
| 143 | + "execution_count": 27, |
107 | 144 | "metadata": {
|
108 | 145 | "ExecuteTime": {
|
109 | 146 | "end_time": "2020-04-23T09:11:38.873496Z",
|
|
115 | 152 | "name": "stdout",
|
116 | 153 | "output_type": "stream",
|
117 | 154 | "text": [
|
118 |
| - "FCD: 0.333862289051325\n" |
| 155 | + "FCD: 0.3298386855756661\n" |
119 | 156 | ]
|
120 | 157 | }
|
121 | 158 | ],
|
122 | 159 | "source": [
|
123 | 160 | "\"\"\"if you don't need to store the activations you can also take a shortcut.\"\"\"\n",
|
124 | 161 | "fcd_score = get_fcd(can_sample1, can_sample2, model)\n",
|
125 | 162 | "\n",
|
126 |
| - "print('FCD: ',fcd_score)" |
| 163 | + "print(\"FCD: \", fcd_score)" |
127 | 164 | ]
|
128 | 165 | },
|
129 | 166 | {
|
130 | 167 | "cell_type": "code",
|
131 |
| - "execution_count": 5, |
| 168 | + "execution_count": 28, |
132 | 169 | "metadata": {
|
133 | 170 | "ExecuteTime": {
|
134 | 171 | "end_time": "2020-04-23T09:11:49.760022Z",
|
|
140 | 177 | "name": "stdout",
|
141 | 178 | "output_type": "stream",
|
142 | 179 | "text": [
|
143 |
| - "FCD: 25.635578193222216\n" |
| 180 | + "FCD: 25.552174526889033\n" |
144 | 181 | ]
|
145 | 182 | }
|
146 | 183 | ],
|
147 | 184 | "source": [
|
148 | 185 | "\"\"\"This is what happens if you do not canonicalize the smiles\"\"\"\n",
|
149 | 186 | "fcd_score = get_fcd(can_sample1, sample2, model)\n",
|
150 |
| - "print('FCD: ',fcd_score)" |
| 187 | + "print(\"FCD: \", fcd_score)" |
151 | 188 | ]
|
152 | 189 | }
|
153 | 190 | ],
|
|
0 commit comments