Skip to content

Commit 91e4d1c

Browse files
committed
Clearing data
1 parent 853f1cd commit 91e4d1c

File tree

7 files changed

+438
-78232
lines changed

7 files changed

+438
-78232
lines changed

projects/bm-news-classification/lab_1/nlp_1.ipynb

+19-95
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,10 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": 1,
5+
"execution_count": null,
66
"id": "initial_id",
77
"metadata": {
8-
"collapsed": true,
9-
"ExecuteTime": {
10-
"end_time": "2024-01-14T18:44:33.991125465Z",
11-
"start_time": "2024-01-14T18:44:33.689398819Z"
12-
}
8+
"collapsed": true
139
},
1410
"outputs": [],
1511
"source": [
@@ -24,61 +20,22 @@
2420
},
2521
{
2622
"cell_type": "code",
27-
"outputs": [
28-
{
29-
"name": "stderr",
30-
"output_type": "stream",
31-
"text": [
32-
"[nltk_data] Downloading package punkt to /home/maxim/nltk_data...\n",
33-
"[nltk_data] Package punkt is already up-to-date!\n",
34-
"[nltk_data] Downloading package omw-1.4 to /home/maxim/nltk_data...\n",
35-
"[nltk_data] Package omw-1.4 is already up-to-date!\n",
36-
"[nltk_data] Downloading package wordnet to /home/maxim/nltk_data...\n",
37-
"[nltk_data] Package wordnet is already up-to-date!\n"
38-
]
39-
},
40-
{
41-
"data": {
42-
"text/plain": "True"
43-
},
44-
"execution_count": 2,
45-
"metadata": {},
46-
"output_type": "execute_result"
47-
}
48-
],
23+
"outputs": [],
4924
"source": [
5025
"nltk.download('punkt')\n",
5126
"nltk.download('omw-1.4')\n",
5227
"nltk.download('wordnet')"
5328
],
5429
"metadata": {
55-
"collapsed": false,
56-
"ExecuteTime": {
57-
"end_time": "2024-01-14T18:44:34.341115279Z",
58-
"start_time": "2024-01-14T18:44:33.992572197Z"
59-
}
30+
"collapsed": false
6031
},
6132
"id": "92a102ee9ba336d7",
62-
"execution_count": 2
33+
"execution_count": null
6334
},
6435
{
6536
"cell_type": "code",
66-
"execution_count": 3,
67-
"outputs": [
68-
{
69-
"name": "stdout",
70-
"output_type": "stream",
71-
"text": [
72-
" label text\n",
73-
"0 3 Wall St. Bears Claw Back Into the Black (Reute...\n",
74-
"1 3 Carlyle Looks Toward Commercial Aerospace (Reu...\n",
75-
"2 3 Oil and Economy Cloud Stocks' Outlook (Reuters...\n",
76-
"3 3 Iraq Halts Oil Exports from Main Southern Pipe...\n",
77-
"4 3 Oil prices soar to all-time record, posing new...\n",
78-
"Carlyle Looks Toward Commercial Aerospace (Reuters). Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.\n"
79-
]
80-
}
81-
],
37+
"execution_count": null,
38+
"outputs": [],
8239
"source": [
8340
"df = pd.read_csv(os.path.join('../dataset/raw/train.csv'), names=['label', 'Title', 'Description'])\n",
8441
"df['text'] = (df['Title'] + '. ' + df['Description'])\n",
@@ -87,42 +44,25 @@
8744
"print(df['text'][1])"
8845
],
8946
"metadata": {
90-
"collapsed": false,
91-
"ExecuteTime": {
92-
"end_time": "2024-01-14T18:44:34.665186382Z",
93-
"start_time": "2024-01-14T18:44:34.342871597Z"
94-
}
47+
"collapsed": false
9548
},
9649
"id": "10d57caa4259352a"
9750
},
9851
{
9952
"cell_type": "code",
100-
"execution_count": 4,
101-
"outputs": [
102-
{
103-
"data": {
104-
"text/plain": "0 3\n1 3\n2 3\n3 3\n4 3\n ..\n119995 1\n119996 2\n119997 2\n119998 2\n119999 2\nName: label, Length: 120000, dtype: int64"
105-
},
106-
"execution_count": 4,
107-
"metadata": {},
108-
"output_type": "execute_result"
109-
}
110-
],
53+
"execution_count": null,
54+
"outputs": [],
11155
"source": [
11256
"df['label']"
11357
],
11458
"metadata": {
115-
"collapsed": false,
116-
"ExecuteTime": {
117-
"end_time": "2024-01-14T18:44:34.673196662Z",
118-
"start_time": "2024-01-14T18:44:34.667446876Z"
119-
}
59+
"collapsed": false
12060
},
12161
"id": "ad6c6f57643b7841"
12262
},
12363
{
12464
"cell_type": "code",
125-
"execution_count": 5,
65+
"execution_count": null,
12666
"outputs": [],
12767
"source": [
12868
"def process_file(df, path):\n",
@@ -147,17 +87,13 @@
14787
" save_to_file(original, lemmatized, stemmed, os.path.join(str(row['label']), f'{str(counter)}.tsv'), path)"
14888
],
14989
"metadata": {
150-
"collapsed": false,
151-
"ExecuteTime": {
152-
"end_time": "2024-01-14T18:44:34.674441301Z",
153-
"start_time": "2024-01-14T18:44:34.671605126Z"
154-
}
90+
"collapsed": false
15591
},
15692
"id": "b2f4a0464b92971e"
15793
},
15894
{
15995
"cell_type": "code",
160-
"execution_count": 6,
96+
"execution_count": null,
16197
"outputs": [],
16298
"source": [
16399
"df = pd.read_csv(os.path.join('../dataset/raw/train.csv'), names=['label', 'Title', 'Description'])\n",
@@ -167,17 +103,13 @@
167103
"process_file(df, os.path.join('..', 'assets', 'annotated-corpus', 'train'))"
168104
],
169105
"metadata": {
170-
"collapsed": false,
171-
"ExecuteTime": {
172-
"end_time": "2024-01-14T18:45:41.882656959Z",
173-
"start_time": "2024-01-14T18:44:34.676441732Z"
174-
}
106+
"collapsed": false
175107
},
176108
"id": "9dc8c43fc3a82676"
177109
},
178110
{
179111
"cell_type": "code",
180-
"execution_count": 7,
112+
"execution_count": null,
181113
"outputs": [],
182114
"source": [
183115
"df = pd.read_csv(os.path.join('../dataset/raw/test.csv'), names=['label', 'Title', 'Description'])\n",
@@ -187,11 +119,7 @@
187119
"process_file(df, os.path.join('..', 'assets', 'annotated-corpus', 'test'))"
188120
],
189121
"metadata": {
190-
"collapsed": false,
191-
"ExecuteTime": {
192-
"end_time": "2024-01-14T18:45:46.209967327Z",
193-
"start_time": "2024-01-14T18:45:41.882440048Z"
194-
}
122+
"collapsed": false
195123
},
196124
"id": "65ad549a2d70ff6c"
197125
},
@@ -200,14 +128,10 @@
200128
"outputs": [],
201129
"source": [],
202130
"metadata": {
203-
"collapsed": false,
204-
"ExecuteTime": {
205-
"end_time": "2024-01-14T18:45:46.220037002Z",
206-
"start_time": "2024-01-14T18:45:46.211685403Z"
207-
}
131+
"collapsed": false
208132
},
209133
"id": "b6b3ae49a9f95763",
210-
"execution_count": 7
134+
"execution_count": null
211135
},
212136
{
213137
"cell_type": "code",

0 commit comments

Comments
 (0)