|
2 | 2 | "cells": [
|
3 | 3 | {
|
4 | 4 | "cell_type": "code",
|
5 |
| - "execution_count": 1, |
| 5 | + "execution_count": null, |
6 | 6 | "id": "initial_id",
|
7 | 7 | "metadata": {
|
8 |
| - "collapsed": true, |
9 |
| - "ExecuteTime": { |
10 |
| - "end_time": "2023-12-25T11:37:20.112581800Z", |
11 |
| - "start_time": "2023-12-25T11:36:52.421353500Z" |
12 |
| - } |
| 8 | + "collapsed": true |
13 | 9 | },
|
14 |
| - "outputs": [ |
15 |
| - { |
16 |
| - "name": "stderr", |
17 |
| - "output_type": "stream", |
18 |
| - "text": [ |
19 |
| - "[nltk_data] Downloading package punkt to C:\\Users\\Mangarakov\n", |
20 |
| - "[nltk_data] Alexandr\\AppData\\Roaming\\nltk_data...\n", |
21 |
| - "[nltk_data] Package punkt is already up-to-date!\n", |
22 |
| - "[nltk_data] Downloading package omw-1.4 to C:\\Users\\Mangarakov\n", |
23 |
| - "[nltk_data] Alexandr\\AppData\\Roaming\\nltk_data...\n", |
24 |
| - "[nltk_data] Package omw-1.4 is already up-to-date!\n" |
25 |
| - ] |
26 |
| - } |
27 |
| - ], |
| 10 | + "outputs": [], |
28 | 11 | "source": [
|
29 | 12 | "import nltk\n",
|
30 | 13 | "\n",
|
|
36 | 19 | },
|
37 | 20 | {
|
38 | 21 | "cell_type": "code",
|
39 |
| - "execution_count": 2, |
40 |
| - "outputs": [ |
41 |
| - { |
42 |
| - "name": "stdout", |
43 |
| - "output_type": "stream", |
44 |
| - "text": [ |
45 |
| - " label text\n", |
46 |
| - "0 3 Wall St. Bears Claw Back Into the Black (Reute...\n", |
47 |
| - "1 3 Carlyle Looks Toward Commercial Aerospace (Reu...\n", |
48 |
| - "2 3 Oil and Economy Cloud Stocks' Outlook (Reuters...\n", |
49 |
| - "3 3 Iraq Halts Oil Exports from Main Southern Pipe...\n", |
50 |
| - "4 3 Oil prices soar to all-time record, posing new...\n", |
51 |
| - "Carlyle Looks Toward Commercial Aerospace (Reuters). Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.\n" |
52 |
| - ] |
53 |
| - } |
54 |
| - ], |
| 22 | + "execution_count": null, |
| 23 | + "outputs": [], |
55 | 24 | "source": [
|
56 | 25 | "import os\n",
|
57 | 26 | "\n",
|
|
62 | 31 | "print(df['text'][1])"
|
63 | 32 | ],
|
64 | 33 | "metadata": {
|
65 |
| - "collapsed": false, |
66 |
| - "ExecuteTime": { |
67 |
| - "end_time": "2023-12-25T11:37:21.109578500Z", |
68 |
| - "start_time": "2023-12-25T11:37:20.120584800Z" |
69 |
| - } |
| 34 | + "collapsed": false |
70 | 35 | },
|
71 | 36 | "id": "10d57caa4259352a"
|
72 | 37 | },
|
73 | 38 | {
|
74 | 39 | "cell_type": "code",
|
75 |
| - "execution_count": 3, |
76 |
| - "outputs": [ |
77 |
| - { |
78 |
| - "data": { |
79 |
| - "text/plain": "0 3\n1 3\n2 3\n3 3\n4 3\n ..\n119995 1\n119996 2\n119997 2\n119998 2\n119999 2\nName: label, Length: 120000, dtype: int64" |
80 |
| - }, |
81 |
| - "execution_count": 3, |
82 |
| - "metadata": {}, |
83 |
| - "output_type": "execute_result" |
84 |
| - } |
85 |
| - ], |
| 40 | + "execution_count": null, |
| 41 | + "outputs": [], |
86 | 42 | "source": [
|
87 | 43 | "df['label']"
|
88 | 44 | ],
|
89 | 45 | "metadata": {
|
90 |
| - "collapsed": false, |
91 |
| - "ExecuteTime": { |
92 |
| - "end_time": "2023-12-25T11:37:21.207579200Z", |
93 |
| - "start_time": "2023-12-25T11:37:21.113583900Z" |
94 |
| - } |
| 46 | + "collapsed": false |
95 | 47 | },
|
96 | 48 | "id": "ad6c6f57643b7841"
|
97 | 49 | },
|
98 | 50 | {
|
99 | 51 | "cell_type": "code",
|
100 |
| - "execution_count": 4, |
| 52 | + "execution_count": null, |
101 | 53 | "outputs": [],
|
102 | 54 | "source": [
|
103 | 55 | "import re\n",
|
|
111 | 63 | " return sentences"
|
112 | 64 | ],
|
113 | 65 | "metadata": {
|
114 |
| - "collapsed": false, |
115 |
| - "ExecuteTime": { |
116 |
| - "end_time": "2023-12-25T11:37:21.209578500Z", |
117 |
| - "start_time": "2023-12-25T11:37:21.148579800Z" |
118 |
| - } |
| 66 | + "collapsed": false |
119 | 67 | },
|
120 | 68 | "id": "f16b78e23c14172f"
|
121 | 69 | },
|
122 | 70 | {
|
123 | 71 | "cell_type": "code",
|
124 |
| - "execution_count": 5, |
| 72 | + "execution_count": null, |
125 | 73 | "outputs": [],
|
126 | 74 | "source": [
|
127 | 75 | "def split_to_words(sentence):\n",
|
128 | 76 | " words = re.findall(r\"\\w+@\\w+\\.\\w+|\\+\\d{1,3}-\\d{3}-\\d{3}-\\d{2}-\\d{2}|\\w+\", sentence)\n",
|
129 | 77 | " return words"
|
130 | 78 | ],
|
131 | 79 | "metadata": {
|
132 |
| - "collapsed": false, |
133 |
| - "ExecuteTime": { |
134 |
| - "end_time": "2023-12-25T11:37:21.211581300Z", |
135 |
| - "start_time": "2023-12-25T11:37:21.175580900Z" |
136 |
| - } |
| 80 | + "collapsed": false |
137 | 81 | },
|
138 | 82 | "id": "bded99f420ae9022"
|
139 | 83 | },
|
140 | 84 | {
|
141 | 85 | "cell_type": "code",
|
142 |
| - "execution_count": 6, |
| 86 | + "execution_count": null, |
143 | 87 | "outputs": [],
|
144 | 88 | "source": [
|
145 | 89 | "def save_to_file(original, lemmatized, stemmed, id, path):\n",
|
|
151 | 95 | " print(original[i], stemmed[i], lemmatized[i], sep=\"\\t\", file=f)"
|
152 | 96 | ],
|
153 | 97 | "metadata": {
|
154 |
| - "collapsed": false, |
155 |
| - "ExecuteTime": { |
156 |
| - "end_time": "2023-12-25T11:37:21.328580900Z", |
157 |
| - "start_time": "2023-12-25T11:37:21.192582500Z" |
158 |
| - } |
| 98 | + "collapsed": false |
159 | 99 | },
|
160 | 100 | "id": "e627613a3632834b"
|
161 | 101 | },
|
162 | 102 | {
|
163 | 103 | "cell_type": "code",
|
164 |
| - "execution_count": 7, |
165 |
| - "outputs": [ |
166 |
| - { |
167 |
| - "name": "stderr", |
168 |
| - "output_type": "stream", |
169 |
| - "text": [ |
170 |
| - "[nltk_data] Downloading package wordnet to C:\\Users\\Mangarakov\n", |
171 |
| - "[nltk_data] Alexandr\\AppData\\Roaming\\nltk_data...\n", |
172 |
| - "[nltk_data] Package wordnet is already up-to-date!\n" |
173 |
| - ] |
174 |
| - } |
175 |
| - ], |
| 104 | + "execution_count": null, |
| 105 | + "outputs": [], |
176 | 106 | "source": [
|
177 | 107 | "from nltk import WordNetLemmatizer\n",
|
178 | 108 | "from nltk import SnowballStemmer\n",
|
|
202 | 132 | " save_to_file(original, lemmatized, stemmed, os.path.join(str(row['label']), f'{str(counter)}.tsv'), path)"
|
203 | 133 | ],
|
204 | 134 | "metadata": {
|
205 |
| - "collapsed": false, |
206 |
| - "ExecuteTime": { |
207 |
| - "end_time": "2023-12-25T11:37:21.331775800Z", |
208 |
| - "start_time": "2023-12-25T11:37:21.226582500Z" |
209 |
| - } |
| 135 | + "collapsed": false |
210 | 136 | },
|
211 | 137 | "id": "b2f4a0464b92971e"
|
212 | 138 | },
|
213 | 139 | {
|
214 | 140 | "cell_type": "code",
|
215 |
| - "execution_count": 7, |
| 141 | + "execution_count": null, |
216 | 142 | "outputs": [],
|
217 | 143 | "source": [],
|
218 | 144 | "metadata": {
|
219 |
| - "collapsed": false, |
220 |
| - "ExecuteTime": { |
221 |
| - "end_time": "2023-12-25T11:37:21.361308100Z", |
222 |
| - "start_time": "2023-12-25T11:37:21.326584800Z" |
223 |
| - } |
| 145 | + "collapsed": false |
224 | 146 | },
|
225 | 147 | "id": "eae86957a0812dbe"
|
226 | 148 | },
|
227 | 149 | {
|
228 | 150 | "cell_type": "code",
|
229 |
| - "execution_count": 8, |
| 151 | + "execution_count": null, |
230 | 152 | "outputs": [],
|
231 | 153 | "source": [
|
232 | 154 | "df = pd.read_csv(os.path.join('../../dataset/raw/train.csv'), names=['label', 'Title', 'Description'])\n",
|
|
236 | 158 | "process_file(df, os.path.join('..', 'assets', 'annotated-corpus', 'train'))"
|
237 | 159 | ],
|
238 | 160 | "metadata": {
|
239 |
| - "collapsed": false, |
240 |
| - "ExecuteTime": { |
241 |
| - "end_time": "2023-12-25T11:45:31.318174500Z", |
242 |
| - "start_time": "2023-12-25T11:37:21.350309100Z" |
243 |
| - } |
| 161 | + "collapsed": false |
244 | 162 | },
|
245 | 163 | "id": "9dc8c43fc3a82676"
|
246 | 164 | },
|
247 | 165 | {
|
248 | 166 | "cell_type": "code",
|
249 |
| - "execution_count": 9, |
| 167 | + "execution_count": null, |
250 | 168 | "outputs": [],
|
251 | 169 | "source": [
|
252 | 170 | "df = pd.read_csv(os.path.join('../../dataset/raw/test.csv'), names=['label', 'Title', 'Description'])\n",
|
|
256 | 174 | "process_file(df, os.path.join('..', 'assets', 'annotated-corpus', 'test'))"
|
257 | 175 | ],
|
258 | 176 | "metadata": {
|
259 |
| - "collapsed": false, |
260 |
| - "ExecuteTime": { |
261 |
| - "end_time": "2023-12-25T11:45:58.557854700Z", |
262 |
| - "start_time": "2023-12-25T11:45:31.328178900Z" |
263 |
| - } |
| 177 | + "collapsed": false |
264 | 178 | },
|
265 | 179 | "id": "65ad549a2d70ff6c"
|
266 | 180 | },
|
|
0 commit comments