Skip to content

Commit 7a3741b

Browse files
committed
Правки ревью
1 parent 4a1c940 commit 7a3741b

File tree

5 files changed

+217
-2524
lines changed

5 files changed

+217
-2524
lines changed

projects/ad-news-classification/lab_1/nlp_1.ipynb

+24-110
Original file line numberDiff line numberDiff line change
@@ -2,29 +2,12 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": 1,
5+
"execution_count": null,
66
"id": "initial_id",
77
"metadata": {
8-
"collapsed": true,
9-
"ExecuteTime": {
10-
"end_time": "2023-12-25T11:37:20.112581800Z",
11-
"start_time": "2023-12-25T11:36:52.421353500Z"
12-
}
8+
"collapsed": true
139
},
14-
"outputs": [
15-
{
16-
"name": "stderr",
17-
"output_type": "stream",
18-
"text": [
19-
"[nltk_data] Downloading package punkt to C:\\Users\\Mangarakov\n",
20-
"[nltk_data] Alexandr\\AppData\\Roaming\\nltk_data...\n",
21-
"[nltk_data] Package punkt is already up-to-date!\n",
22-
"[nltk_data] Downloading package omw-1.4 to C:\\Users\\Mangarakov\n",
23-
"[nltk_data] Alexandr\\AppData\\Roaming\\nltk_data...\n",
24-
"[nltk_data] Package omw-1.4 is already up-to-date!\n"
25-
]
26-
}
27-
],
10+
"outputs": [],
2811
"source": [
2912
"import nltk\n",
3013
"\n",
@@ -36,22 +19,8 @@
3619
},
3720
{
3821
"cell_type": "code",
39-
"execution_count": 2,
40-
"outputs": [
41-
{
42-
"name": "stdout",
43-
"output_type": "stream",
44-
"text": [
45-
" label text\n",
46-
"0 3 Wall St. Bears Claw Back Into the Black (Reute...\n",
47-
"1 3 Carlyle Looks Toward Commercial Aerospace (Reu...\n",
48-
"2 3 Oil and Economy Cloud Stocks' Outlook (Reuters...\n",
49-
"3 3 Iraq Halts Oil Exports from Main Southern Pipe...\n",
50-
"4 3 Oil prices soar to all-time record, posing new...\n",
51-
"Carlyle Looks Toward Commercial Aerospace (Reuters). Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.\n"
52-
]
53-
}
54-
],
22+
"execution_count": null,
23+
"outputs": [],
5524
"source": [
5625
"import os\n",
5726
"\n",
@@ -62,42 +31,25 @@
6231
"print(df['text'][1])"
6332
],
6433
"metadata": {
65-
"collapsed": false,
66-
"ExecuteTime": {
67-
"end_time": "2023-12-25T11:37:21.109578500Z",
68-
"start_time": "2023-12-25T11:37:20.120584800Z"
69-
}
34+
"collapsed": false
7035
},
7136
"id": "10d57caa4259352a"
7237
},
7338
{
7439
"cell_type": "code",
75-
"execution_count": 3,
76-
"outputs": [
77-
{
78-
"data": {
79-
"text/plain": "0 3\n1 3\n2 3\n3 3\n4 3\n ..\n119995 1\n119996 2\n119997 2\n119998 2\n119999 2\nName: label, Length: 120000, dtype: int64"
80-
},
81-
"execution_count": 3,
82-
"metadata": {},
83-
"output_type": "execute_result"
84-
}
85-
],
40+
"execution_count": null,
41+
"outputs": [],
8642
"source": [
8743
"df['label']"
8844
],
8945
"metadata": {
90-
"collapsed": false,
91-
"ExecuteTime": {
92-
"end_time": "2023-12-25T11:37:21.207579200Z",
93-
"start_time": "2023-12-25T11:37:21.113583900Z"
94-
}
46+
"collapsed": false
9547
},
9648
"id": "ad6c6f57643b7841"
9749
},
9850
{
9951
"cell_type": "code",
100-
"execution_count": 4,
52+
"execution_count": null,
10153
"outputs": [],
10254
"source": [
10355
"import re\n",
@@ -111,35 +63,27 @@
11163
" return sentences"
11264
],
11365
"metadata": {
114-
"collapsed": false,
115-
"ExecuteTime": {
116-
"end_time": "2023-12-25T11:37:21.209578500Z",
117-
"start_time": "2023-12-25T11:37:21.148579800Z"
118-
}
66+
"collapsed": false
11967
},
12068
"id": "f16b78e23c14172f"
12169
},
12270
{
12371
"cell_type": "code",
124-
"execution_count": 5,
72+
"execution_count": null,
12573
"outputs": [],
12674
"source": [
12775
"def split_to_words(sentence):\n",
12876
" words = re.findall(r\"\\w+@\\w+\\.\\w+|\\+\\d{1,3}-\\d{3}-\\d{3}-\\d{2}-\\d{2}|\\w+\", sentence)\n",
12977
" return words"
13078
],
13179
"metadata": {
132-
"collapsed": false,
133-
"ExecuteTime": {
134-
"end_time": "2023-12-25T11:37:21.211581300Z",
135-
"start_time": "2023-12-25T11:37:21.175580900Z"
136-
}
80+
"collapsed": false
13781
},
13882
"id": "bded99f420ae9022"
13983
},
14084
{
14185
"cell_type": "code",
142-
"execution_count": 6,
86+
"execution_count": null,
14387
"outputs": [],
14488
"source": [
14589
"def save_to_file(original, lemmatized, stemmed, id, path):\n",
@@ -151,28 +95,14 @@
15195
" print(original[i], stemmed[i], lemmatized[i], sep=\"\\t\", file=f)"
15296
],
15397
"metadata": {
154-
"collapsed": false,
155-
"ExecuteTime": {
156-
"end_time": "2023-12-25T11:37:21.328580900Z",
157-
"start_time": "2023-12-25T11:37:21.192582500Z"
158-
}
98+
"collapsed": false
15999
},
160100
"id": "e627613a3632834b"
161101
},
162102
{
163103
"cell_type": "code",
164-
"execution_count": 7,
165-
"outputs": [
166-
{
167-
"name": "stderr",
168-
"output_type": "stream",
169-
"text": [
170-
"[nltk_data] Downloading package wordnet to C:\\Users\\Mangarakov\n",
171-
"[nltk_data] Alexandr\\AppData\\Roaming\\nltk_data...\n",
172-
"[nltk_data] Package wordnet is already up-to-date!\n"
173-
]
174-
}
175-
],
104+
"execution_count": null,
105+
"outputs": [],
176106
"source": [
177107
"from nltk import WordNetLemmatizer\n",
178108
"from nltk import SnowballStemmer\n",
@@ -202,31 +132,23 @@
202132
" save_to_file(original, lemmatized, stemmed, os.path.join(str(row['label']), f'{str(counter)}.tsv'), path)"
203133
],
204134
"metadata": {
205-
"collapsed": false,
206-
"ExecuteTime": {
207-
"end_time": "2023-12-25T11:37:21.331775800Z",
208-
"start_time": "2023-12-25T11:37:21.226582500Z"
209-
}
135+
"collapsed": false
210136
},
211137
"id": "b2f4a0464b92971e"
212138
},
213139
{
214140
"cell_type": "code",
215-
"execution_count": 7,
141+
"execution_count": null,
216142
"outputs": [],
217143
"source": [],
218144
"metadata": {
219-
"collapsed": false,
220-
"ExecuteTime": {
221-
"end_time": "2023-12-25T11:37:21.361308100Z",
222-
"start_time": "2023-12-25T11:37:21.326584800Z"
223-
}
145+
"collapsed": false
224146
},
225147
"id": "eae86957a0812dbe"
226148
},
227149
{
228150
"cell_type": "code",
229-
"execution_count": 8,
151+
"execution_count": null,
230152
"outputs": [],
231153
"source": [
232154
"df = pd.read_csv(os.path.join('../../dataset/raw/train.csv'), names=['label', 'Title', 'Description'])\n",
@@ -236,17 +158,13 @@
236158
"process_file(df, os.path.join('..', 'assets', 'annotated-corpus', 'train'))"
237159
],
238160
"metadata": {
239-
"collapsed": false,
240-
"ExecuteTime": {
241-
"end_time": "2023-12-25T11:45:31.318174500Z",
242-
"start_time": "2023-12-25T11:37:21.350309100Z"
243-
}
161+
"collapsed": false
244162
},
245163
"id": "9dc8c43fc3a82676"
246164
},
247165
{
248166
"cell_type": "code",
249-
"execution_count": 9,
167+
"execution_count": null,
250168
"outputs": [],
251169
"source": [
252170
"df = pd.read_csv(os.path.join('../../dataset/raw/test.csv'), names=['label', 'Title', 'Description'])\n",
@@ -256,11 +174,7 @@
256174
"process_file(df, os.path.join('..', 'assets', 'annotated-corpus', 'test'))"
257175
],
258176
"metadata": {
259-
"collapsed": false,
260-
"ExecuteTime": {
261-
"end_time": "2023-12-25T11:45:58.557854700Z",
262-
"start_time": "2023-12-25T11:45:31.328178900Z"
263-
}
177+
"collapsed": false
264178
},
265179
"id": "65ad549a2d70ff6c"
266180
},

0 commit comments

Comments
 (0)