Skip to content

Commit 7714314

Browse files
committed
Clearing data
1 parent 853f1cd commit 7714314

File tree

7 files changed

+180
-6122
lines changed

7 files changed

+180
-6122
lines changed

projects/bm-news-classification/lab_1/nlp_1.ipynb

+9-65
Original file line numberDiff line numberDiff line change
@@ -24,61 +24,22 @@
2424
},
2525
{
2626
"cell_type": "code",
27-
"outputs": [
28-
{
29-
"name": "stderr",
30-
"output_type": "stream",
31-
"text": [
32-
"[nltk_data] Downloading package punkt to /home/maxim/nltk_data...\n",
33-
"[nltk_data] Package punkt is already up-to-date!\n",
34-
"[nltk_data] Downloading package omw-1.4 to /home/maxim/nltk_data...\n",
35-
"[nltk_data] Package omw-1.4 is already up-to-date!\n",
36-
"[nltk_data] Downloading package wordnet to /home/maxim/nltk_data...\n",
37-
"[nltk_data] Package wordnet is already up-to-date!\n"
38-
]
39-
},
40-
{
41-
"data": {
42-
"text/plain": "True"
43-
},
44-
"execution_count": 2,
45-
"metadata": {},
46-
"output_type": "execute_result"
47-
}
48-
],
27+
"outputs": [],
4928
"source": [
5029
"nltk.download('punkt')\n",
5130
"nltk.download('omw-1.4')\n",
5231
"nltk.download('wordnet')"
5332
],
5433
"metadata": {
55-
"collapsed": false,
56-
"ExecuteTime": {
57-
"end_time": "2024-01-14T18:44:34.341115279Z",
58-
"start_time": "2024-01-14T18:44:33.992572197Z"
59-
}
34+
"collapsed": false
6035
},
6136
"id": "92a102ee9ba336d7",
62-
"execution_count": 2
37+
"execution_count": null
6338
},
6439
{
6540
"cell_type": "code",
66-
"execution_count": 3,
67-
"outputs": [
68-
{
69-
"name": "stdout",
70-
"output_type": "stream",
71-
"text": [
72-
" label text\n",
73-
"0 3 Wall St. Bears Claw Back Into the Black (Reute...\n",
74-
"1 3 Carlyle Looks Toward Commercial Aerospace (Reu...\n",
75-
"2 3 Oil and Economy Cloud Stocks' Outlook (Reuters...\n",
76-
"3 3 Iraq Halts Oil Exports from Main Southern Pipe...\n",
77-
"4 3 Oil prices soar to all-time record, posing new...\n",
78-
"Carlyle Looks Toward Commercial Aerospace (Reuters). Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.\n"
79-
]
80-
}
81-
],
41+
"execution_count": null,
42+
"outputs": [],
8243
"source": [
8344
"df = pd.read_csv(os.path.join('../dataset/raw/train.csv'), names=['label', 'Title', 'Description'])\n",
8445
"df['text'] = (df['Title'] + '. ' + df['Description'])\n",
@@ -87,36 +48,19 @@
8748
"print(df['text'][1])"
8849
],
8950
"metadata": {
90-
"collapsed": false,
91-
"ExecuteTime": {
92-
"end_time": "2024-01-14T18:44:34.665186382Z",
93-
"start_time": "2024-01-14T18:44:34.342871597Z"
94-
}
51+
"collapsed": false
9552
},
9653
"id": "10d57caa4259352a"
9754
},
9855
{
9956
"cell_type": "code",
100-
"execution_count": 4,
101-
"outputs": [
102-
{
103-
"data": {
104-
"text/plain": "0 3\n1 3\n2 3\n3 3\n4 3\n ..\n119995 1\n119996 2\n119997 2\n119998 2\n119999 2\nName: label, Length: 120000, dtype: int64"
105-
},
106-
"execution_count": 4,
107-
"metadata": {},
108-
"output_type": "execute_result"
109-
}
110-
],
57+
"execution_count": null,
58+
"outputs": [],
11159
"source": [
11260
"df['label']"
11361
],
11462
"metadata": {
115-
"collapsed": false,
116-
"ExecuteTime": {
117-
"end_time": "2024-01-14T18:44:34.673196662Z",
118-
"start_time": "2024-01-14T18:44:34.667446876Z"
119-
}
63+
"collapsed": false
12064
},
12165
"id": "ad6c6f57643b7841"
12266
},

projects/bm-news-classification/lab_2/nlp_2.ipynb

+24-136
Original file line numberDiff line numberDiff line change
@@ -23,34 +23,13 @@
2323
},
2424
{
2525
"cell_type": "code",
26-
"execution_count": 2,
27-
"outputs": [
28-
{
29-
"name": "stderr",
30-
"output_type": "stream",
31-
"text": [
32-
"[nltk_data] Downloading package stopwords to /home/maxim/nltk_data...\n",
33-
"[nltk_data] Package stopwords is already up-to-date!\n"
34-
]
35-
},
36-
{
37-
"data": {
38-
"text/plain": "True"
39-
},
40-
"execution_count": 2,
41-
"metadata": {},
42-
"output_type": "execute_result"
43-
}
44-
],
26+
"execution_count": null,
27+
"outputs": [],
4528
"source": [
4629
"nltk.download('stopwords')"
4730
],
4831
"metadata": {
49-
"collapsed": false,
50-
"ExecuteTime": {
51-
"end_time": "2024-01-14T18:45:55.509530195Z",
52-
"start_time": "2024-01-14T18:45:54.790887051Z"
53-
}
32+
"collapsed": false
5433
},
5534
"id": "f4b26fb355c7206"
5635
},
@@ -122,26 +101,13 @@
122101
},
123102
{
124103
"cell_type": "code",
125-
"execution_count": 6,
126-
"outputs": [
127-
{
128-
"data": {
129-
"text/plain": "16277"
130-
},
131-
"execution_count": 6,
132-
"metadata": {},
133-
"output_type": "execute_result"
134-
}
135-
],
104+
"execution_count": null,
105+
"outputs": [],
136106
"source": [
137107
"len(sentences)"
138108
],
139109
"metadata": {
140-
"collapsed": false,
141-
"ExecuteTime": {
142-
"end_time": "2024-01-14T18:46:11.924326539Z",
143-
"start_time": "2024-01-14T18:46:11.921388541Z"
144-
}
110+
"collapsed": false
145111
},
146112
"id": "8ee431edefba4092"
147113
},
@@ -196,102 +162,50 @@
196162
},
197163
{
198164
"cell_type": "code",
199-
"execution_count": 9,
200-
"outputs": [
201-
{
202-
"data": {
203-
"text/plain": "174818"
204-
},
205-
"execution_count": 9,
206-
"metadata": {},
207-
"output_type": "execute_result"
208-
}
209-
],
165+
"execution_count": null,
166+
"outputs": [],
210167
"source": [
211168
"len(ngrams)"
212169
],
213170
"metadata": {
214-
"collapsed": false,
215-
"ExecuteTime": {
216-
"end_time": "2024-01-14T18:46:12.130476655Z",
217-
"start_time": "2024-01-14T18:46:12.087454830Z"
218-
}
171+
"collapsed": false
219172
},
220173
"id": "645d2bf792d524e2"
221174
},
222175
{
223176
"cell_type": "code",
224-
"execution_count": 10,
225-
"outputs": [
226-
{
227-
"data": {
228-
"text/plain": "[('39', 2802),\n ('new', 1349),\n ('said', 1312),\n ('has', 1219),\n ('reuter', 1148),\n ('ap', 1038),\n ('year', 948),\n ('was', 855),\n ('us', 850),\n ('gt', 760),\n ('lt', 753),\n ('quot', 700),\n ('two', 672),\n ('compani', 660),\n ('first', 657),\n ('say', 554),\n ('one', 549),\n ('world', 536),\n ('report', 532),\n ('u', 511),\n ('monday', 510),\n ('game', 509),\n ('tuesday', 505),\n ('1', 472),\n ('state', 466),\n ('thursday', 466),\n ('win', 461),\n ('wednesday', 453),\n ('inc', 449),\n ('2', 448)]"
229-
},
230-
"execution_count": 10,
231-
"metadata": {},
232-
"output_type": "execute_result"
233-
}
234-
],
177+
"execution_count": null,
178+
"outputs": [],
235179
"source": [
236180
"sorted(word_count.items(), key=lambda x: -x[1])[:30]"
237181
],
238182
"metadata": {
239-
"collapsed": false,
240-
"ExecuteTime": {
241-
"end_time": "2024-01-14T18:46:12.131759006Z",
242-
"start_time": "2024-01-14T18:46:12.130167068Z"
243-
}
183+
"collapsed": false
244184
},
245185
"id": "6439c66a95bedb34"
246186
},
247187
{
248188
"cell_type": "code",
249-
"execution_count": 11,
250-
"outputs": [
251-
{
252-
"data": {
253-
"text/plain": "[(('lt', 'b', 'gt'), 348),\n (('b', 'gt', 'lt'), 174),\n (('gt', 'lt', 'b'), 174),\n (('new', 'york', 'reuter'), 141),\n (('lt', 'href', 'http'), 114),\n (('href', 'http', 'www'), 110),\n (('http', 'www', 'investor'), 96),\n (('www', 'investor', 'reuter'), 96),\n (('investor', 'reuter', 'com'), 96),\n (('reuter', 'com', 'fullquot'), 96),\n (('com', 'fullquot', 'aspx'), 96),\n (('fullquot', 'aspx', 'ticker'), 96),\n (('target', 'stock', 'quickinfo'), 96),\n (('stock', 'quickinfo', 'fullquot'), 96),\n (('quickinfo', 'fullquot', 'gt'), 96),\n (('n', 'lt', 'gt'), 72),\n (('quot', 'profil', 'research'), 71),\n (('n', 'target', 'stock'), 70),\n (('lt', 'p', 'gt'), 66),\n (('inc', 'lt', 'href'), 58),\n (('n', 'quot', 'profil'), 40),\n (('boston', 'red', 'sox'), 39),\n (('gt', 'lt', 'font'), 35),\n (('p', 'gt', 'lt'), 33),\n (('york', 'reuter', 'u'), 33),\n (('gt', 'lt', 'p'), 31),\n (('presid', 'vladimir', 'putin'), 25),\n (('lt', 'font', 'face'), 23),\n (('font', 'face', 'verdana'), 23),\n (('face', 'verdana', 'san'), 23)]"
254-
},
255-
"execution_count": 11,
256-
"metadata": {},
257-
"output_type": "execute_result"
258-
}
259-
],
189+
"execution_count": null,
190+
"outputs": [],
260191
"source": [
261192
"sorted(ngrams_count.items(), key=lambda x: -x[1])[:30]"
262193
],
263194
"metadata": {
264-
"collapsed": false,
265-
"ExecuteTime": {
266-
"end_time": "2024-01-14T18:46:12.132584370Z",
267-
"start_time": "2024-01-14T18:46:12.130441799Z"
268-
}
195+
"collapsed": false
269196
},
270197
"id": "dccccee88e69fef6"
271198
},
272199
{
273200
"cell_type": "code",
274-
"execution_count": 12,
275-
"outputs": [
276-
{
277-
"data": {
278-
"text/plain": "207236"
279-
},
280-
"execution_count": 12,
281-
"metadata": {},
282-
"output_type": "execute_result"
283-
}
284-
],
201+
"execution_count": null,
202+
"outputs": [],
285203
"source": [
286204
"total_words = sum(word_count.values())\n",
287205
"total_words"
288206
],
289207
"metadata": {
290-
"collapsed": false,
291-
"ExecuteTime": {
292-
"end_time": "2024-01-14T18:46:12.133664187Z",
293-
"start_time": "2024-01-14T18:46:12.130696893Z"
294-
}
208+
"collapsed": false
295209
},
296210
"id": "57340fb3a953ba"
297211
},
@@ -309,17 +223,8 @@
309223
},
310224
{
311225
"cell_type": "code",
312-
"execution_count": 13,
313-
"outputs": [
314-
{
315-
"data": {
316-
"text/plain": "[(('mcteer', 'lonesom', 'dove'), 35.321830233977494),\n (('exot', 'melaleuca', 'iguana'), 35.321830233977494),\n (('lonesom', 'dove', 'aggi'), 35.321830233977494),\n (('terin', 'humphrey', 'annia'), 35.321830233977494),\n (('thelma', 'drake', 'norfolk'), 35.321830233977494),\n (('nr', 'narayana', 'murthi'), 35.321830233977494),\n (('unsign', 'adewal', 'ogunley'), 35.321830233977494),\n (('ellen', 'zane', 'oversaw'), 35.321830233977494),\n (('mou', 'tamanthi', 'hydroelectr'), 35.321830233977494),\n (('laserjet', '4345mfp', 'multifunct'), 35.321830233977494),\n (('sarwan', 'shivnarin', 'chanderpaul'), 35.321830233977494),\n (('781', '442', '0750'), 35.321830233977494),\n (('drool', 'alt', 'rocker'), 35.321830233977494),\n (('binti', 'pengiran', 'salleh'), 35.321830233977494),\n (('suitor', 'foodland', 'foa'), 35.321830233977494),\n (('bb', 'lob', 'avg'), 35.321830233977494),\n (('nokiajoinssecuredigitalindustrygroup', '2100', '1039_3'),\n 35.321830233977494),\n (('troi', 'rivier', 'que'), 35.321830233977494),\n (('4345mfp', 'multifunct', 'copier'), 35.321830233977494),\n (('intravascular', 'coagul', 'dic'), 35.321830233977494),\n (('munzala', 'arunach', 'macaqu'), 35.321830233977494),\n (('mp3s', 'blatant', 'disregard'), 35.321830233977494),\n (('guidug', 'guh', 'doo'), 35.321830233977494),\n (('kandanski', '781', '442'), 35.321830233977494),\n (('azahari', 'noordin', 'moh'), 35.321830233977494),\n (('blatant', 'disregard', 'hilari'), 35.321830233977494),\n (('bink', 'lookalik', 'gungan'), 35.321830233977494),\n (('jo', 'wilfri', 'tsonga'), 35.321830233977494),\n (('tung', 'chee', 'hwa'), 35.321830233977494),\n (('macaca', 'munzala', 'arunach'), 35.321830233977494),\n (('middleborough', 'middleboro', 'cobra'), 35.321830233977494),\n (('jane', 'westborough', 'woke'), 35.321830233977494),\n (('netinfomanag', 'postfix', 'serveradmin'), 35.321830233977494),\n (('loren', 'galler', 'rabinowitz'), 35.321830233977494),\n (('petroliam', 'nasion', 'bhd'), 35.321830233977494),\n (('ramnaresh', 'sarwan', 'shivnarin'), 35.321830233977494),\n (('fourier', 'spectromet', 'pfs'), 35.321830233977494),\n (('inver', 'caledonian', 'thistl'), 35.321830233977494),\n (('cna', 'academia', 'sinica'), 35.321830233977494),\n (('2100', '1039_3', '5365922'), 35.321830233977494),\n (('yu', 'shyi', 'kun'), 35.321830233977494),\n (('klien', 'vitantonio', 'liuzzi'), 35.321830233977494),\n (('folger', 'espresso', 'dunkin'), 35.321830233977494),\n (('olympiqu', 'marseill', '1993'), 35.321830233977494),\n (('fsb', 'fud', 'foi'), 35.321830233977494),\n (('ku', 'klux', 'klan'), 35.321830233977494),\n (('shadi', 'nook', 'cranni'), 35.321830233977494),\n (('pickoff', 'cutoff', 'bunt'), 35.321830233977494),\n (('gino', 'guidug', 'guh'), 35.321830233977494),\n (('palett', 'pastel', 'hue'), 35.321830233977494),\n (('sher', 'bahadur', 'deuba'), 35.321830233977494),\n (('humidor', 'darth', 'vader'), 35.321830233977494),\n (('nesn', 'weei', 'lhp'), 35.321830233977494),\n (('jsp', 'storyid', '53949'), 34.321830233977494),\n (('headshak', 'seti', 'headfak'), 34.321830233977494),\n (('mk', 'matan', 'vilnai'), 34.321830233977494),\n (('junki', 'whet', 'appetit'), 34.321830233977494),\n (('jar', 'bink', 'lookalik'), 34.321830233977494),\n (('fewest', 'numbest', 'unearn'), 34.321830233977494),\n (('ah', 'jaffor', 'ullah'), 34.321830233977494)]"
317-
},
318-
"execution_count": 13,
319-
"metadata": {},
320-
"output_type": "execute_result"
321-
}
322-
],
226+
"execution_count": null,
227+
"outputs": [],
323228
"source": [
324229
"ngram_score = {}\n",
325230
"for ngram in set(ngrams):\n",
@@ -330,27 +235,14 @@
330235
"sorted(ngram_score.items(), key=lambda x: -x[1])[0:60]"
331236
],
332237
"metadata": {
333-
"collapsed": false,
334-
"ExecuteTime": {
335-
"end_time": "2024-01-14T18:46:12.563721516Z",
336-
"start_time": "2024-01-14T18:46:12.195696235Z"
337-
}
238+
"collapsed": false
338239
},
339240
"id": "239eade116446b8a"
340241
},
341242
{
342243
"cell_type": "code",
343-
"execution_count": 14,
344-
"outputs": [
345-
{
346-
"data": {
347-
"text/plain": "[('1913', 'doesnt', 'clearcut'),\n ('1x1', 'ord', '200301151450'),\n ('2100', '1039_3', '5365922'),\n ('4345mfp', 'multifunct', 'copier'),\n ('563', 'kph', 'vampir'),\n ('781', '442', '0750'),\n ('azahari', 'noordin', 'moh'),\n ('bb', 'lob', 'avg'),\n ('bink', 'lookalik', 'gungan'),\n ('binti', 'pengiran', 'salleh'),\n ('blatant', 'disregard', 'hilari'),\n ('cna', 'academia', 'sinica'),\n ('drool', 'alt', 'rocker'),\n ('ellen', 'zane', 'oversaw'),\n ('exot', 'melaleuca', 'iguana'),\n ('folger', 'espresso', 'dunkin'),\n ('fourier', 'spectromet', 'pfs'),\n ('fsb', 'fud', 'foi'),\n ('gino', 'guidug', 'guh'),\n ('guidug', 'guh', 'doo'),\n ('humidor', 'darth', 'vader'),\n ('intravascular', 'coagul', 'dic'),\n ('inver', 'caledonian', 'thistl'),\n ('jane', 'westborough', 'woke'),\n ('jarkko', 'nieminen', 'overpow'),\n ('jo', 'wilfri', 'tsonga'),\n ('kandanski', '781', '442'),\n ('klien', 'vitantonio', 'liuzzi'),\n ('ku', 'klux', 'klan'),\n ('laserjet', '4345mfp', 'multifunct')]"
348-
},
349-
"execution_count": 14,
350-
"metadata": {},
351-
"output_type": "execute_result"
352-
}
353-
],
244+
"execution_count": null,
245+
"outputs": [],
354246
"source": [
355247
"text = []\n",
356248
"for sentence in sentences:\n",
@@ -359,11 +251,7 @@
359251
"finder.nbest(nltk.collocations.TrigramAssocMeasures().mi_like, 30)"
360252
],
361253
"metadata": {
362-
"collapsed": false,
363-
"ExecuteTime": {
364-
"end_time": "2024-01-14T18:46:13.647509678Z",
365-
"start_time": "2024-01-14T18:46:12.574914843Z"
366-
}
254+
"collapsed": false
367255
},
368256
"id": "7fa7f548f73d5ff4"
369257
}

0 commit comments

Comments
 (0)