|
23 | 23 | },
|
24 | 24 | {
|
25 | 25 | "cell_type": "code",
|
26 |
| - "execution_count": 2, |
27 |
| - "outputs": [ |
28 |
| - { |
29 |
| - "name": "stderr", |
30 |
| - "output_type": "stream", |
31 |
| - "text": [ |
32 |
| - "[nltk_data] Downloading package stopwords to /home/maxim/nltk_data...\n", |
33 |
| - "[nltk_data] Package stopwords is already up-to-date!\n" |
34 |
| - ] |
35 |
| - }, |
36 |
| - { |
37 |
| - "data": { |
38 |
| - "text/plain": "True" |
39 |
| - }, |
40 |
| - "execution_count": 2, |
41 |
| - "metadata": {}, |
42 |
| - "output_type": "execute_result" |
43 |
| - } |
44 |
| - ], |
| 26 | + "execution_count": null, |
| 27 | + "outputs": [], |
45 | 28 | "source": [
|
46 | 29 | "nltk.download('stopwords')"
|
47 | 30 | ],
|
48 | 31 | "metadata": {
|
49 |
| - "collapsed": false, |
50 |
| - "ExecuteTime": { |
51 |
| - "end_time": "2024-01-14T18:45:55.509530195Z", |
52 |
| - "start_time": "2024-01-14T18:45:54.790887051Z" |
53 |
| - } |
| 32 | + "collapsed": false |
54 | 33 | },
|
55 | 34 | "id": "f4b26fb355c7206"
|
56 | 35 | },
|
|
122 | 101 | },
|
123 | 102 | {
|
124 | 103 | "cell_type": "code",
|
125 |
| - "execution_count": 6, |
126 |
| - "outputs": [ |
127 |
| - { |
128 |
| - "data": { |
129 |
| - "text/plain": "16277" |
130 |
| - }, |
131 |
| - "execution_count": 6, |
132 |
| - "metadata": {}, |
133 |
| - "output_type": "execute_result" |
134 |
| - } |
135 |
| - ], |
| 104 | + "execution_count": null, |
| 105 | + "outputs": [], |
136 | 106 | "source": [
|
137 | 107 | "len(sentences)"
|
138 | 108 | ],
|
139 | 109 | "metadata": {
|
140 |
| - "collapsed": false, |
141 |
| - "ExecuteTime": { |
142 |
| - "end_time": "2024-01-14T18:46:11.924326539Z", |
143 |
| - "start_time": "2024-01-14T18:46:11.921388541Z" |
144 |
| - } |
| 110 | + "collapsed": false |
145 | 111 | },
|
146 | 112 | "id": "8ee431edefba4092"
|
147 | 113 | },
|
|
196 | 162 | },
|
197 | 163 | {
|
198 | 164 | "cell_type": "code",
|
199 |
| - "execution_count": 9, |
200 |
| - "outputs": [ |
201 |
| - { |
202 |
| - "data": { |
203 |
| - "text/plain": "174818" |
204 |
| - }, |
205 |
| - "execution_count": 9, |
206 |
| - "metadata": {}, |
207 |
| - "output_type": "execute_result" |
208 |
| - } |
209 |
| - ], |
| 165 | + "execution_count": null, |
| 166 | + "outputs": [], |
210 | 167 | "source": [
|
211 | 168 | "len(ngrams)"
|
212 | 169 | ],
|
213 | 170 | "metadata": {
|
214 |
| - "collapsed": false, |
215 |
| - "ExecuteTime": { |
216 |
| - "end_time": "2024-01-14T18:46:12.130476655Z", |
217 |
| - "start_time": "2024-01-14T18:46:12.087454830Z" |
218 |
| - } |
| 171 | + "collapsed": false |
219 | 172 | },
|
220 | 173 | "id": "645d2bf792d524e2"
|
221 | 174 | },
|
222 | 175 | {
|
223 | 176 | "cell_type": "code",
|
224 |
| - "execution_count": 10, |
225 |
| - "outputs": [ |
226 |
| - { |
227 |
| - "data": { |
228 |
| - "text/plain": "[('39', 2802),\n ('new', 1349),\n ('said', 1312),\n ('has', 1219),\n ('reuter', 1148),\n ('ap', 1038),\n ('year', 948),\n ('was', 855),\n ('us', 850),\n ('gt', 760),\n ('lt', 753),\n ('quot', 700),\n ('two', 672),\n ('compani', 660),\n ('first', 657),\n ('say', 554),\n ('one', 549),\n ('world', 536),\n ('report', 532),\n ('u', 511),\n ('monday', 510),\n ('game', 509),\n ('tuesday', 505),\n ('1', 472),\n ('state', 466),\n ('thursday', 466),\n ('win', 461),\n ('wednesday', 453),\n ('inc', 449),\n ('2', 448)]" |
229 |
| - }, |
230 |
| - "execution_count": 10, |
231 |
| - "metadata": {}, |
232 |
| - "output_type": "execute_result" |
233 |
| - } |
234 |
| - ], |
| 177 | + "execution_count": null, |
| 178 | + "outputs": [], |
235 | 179 | "source": [
|
236 | 180 | "sorted(word_count.items(), key=lambda x: -x[1])[:30]"
|
237 | 181 | ],
|
238 | 182 | "metadata": {
|
239 |
| - "collapsed": false, |
240 |
| - "ExecuteTime": { |
241 |
| - "end_time": "2024-01-14T18:46:12.131759006Z", |
242 |
| - "start_time": "2024-01-14T18:46:12.130167068Z" |
243 |
| - } |
| 183 | + "collapsed": false |
244 | 184 | },
|
245 | 185 | "id": "6439c66a95bedb34"
|
246 | 186 | },
|
247 | 187 | {
|
248 | 188 | "cell_type": "code",
|
249 |
| - "execution_count": 11, |
250 |
| - "outputs": [ |
251 |
| - { |
252 |
| - "data": { |
253 |
| - "text/plain": "[(('lt', 'b', 'gt'), 348),\n (('b', 'gt', 'lt'), 174),\n (('gt', 'lt', 'b'), 174),\n (('new', 'york', 'reuter'), 141),\n (('lt', 'href', 'http'), 114),\n (('href', 'http', 'www'), 110),\n (('http', 'www', 'investor'), 96),\n (('www', 'investor', 'reuter'), 96),\n (('investor', 'reuter', 'com'), 96),\n (('reuter', 'com', 'fullquot'), 96),\n (('com', 'fullquot', 'aspx'), 96),\n (('fullquot', 'aspx', 'ticker'), 96),\n (('target', 'stock', 'quickinfo'), 96),\n (('stock', 'quickinfo', 'fullquot'), 96),\n (('quickinfo', 'fullquot', 'gt'), 96),\n (('n', 'lt', 'gt'), 72),\n (('quot', 'profil', 'research'), 71),\n (('n', 'target', 'stock'), 70),\n (('lt', 'p', 'gt'), 66),\n (('inc', 'lt', 'href'), 58),\n (('n', 'quot', 'profil'), 40),\n (('boston', 'red', 'sox'), 39),\n (('gt', 'lt', 'font'), 35),\n (('p', 'gt', 'lt'), 33),\n (('york', 'reuter', 'u'), 33),\n (('gt', 'lt', 'p'), 31),\n (('presid', 'vladimir', 'putin'), 25),\n (('lt', 'font', 'face'), 23),\n (('font', 'face', 'verdana'), 23),\n (('face', 'verdana', 'san'), 23)]" |
254 |
| - }, |
255 |
| - "execution_count": 11, |
256 |
| - "metadata": {}, |
257 |
| - "output_type": "execute_result" |
258 |
| - } |
259 |
| - ], |
| 189 | + "execution_count": null, |
| 190 | + "outputs": [], |
260 | 191 | "source": [
|
261 | 192 | "sorted(ngrams_count.items(), key=lambda x: -x[1])[:30]"
|
262 | 193 | ],
|
263 | 194 | "metadata": {
|
264 |
| - "collapsed": false, |
265 |
| - "ExecuteTime": { |
266 |
| - "end_time": "2024-01-14T18:46:12.132584370Z", |
267 |
| - "start_time": "2024-01-14T18:46:12.130441799Z" |
268 |
| - } |
| 195 | + "collapsed": false |
269 | 196 | },
|
270 | 197 | "id": "dccccee88e69fef6"
|
271 | 198 | },
|
272 | 199 | {
|
273 | 200 | "cell_type": "code",
|
274 |
| - "execution_count": 12, |
275 |
| - "outputs": [ |
276 |
| - { |
277 |
| - "data": { |
278 |
| - "text/plain": "207236" |
279 |
| - }, |
280 |
| - "execution_count": 12, |
281 |
| - "metadata": {}, |
282 |
| - "output_type": "execute_result" |
283 |
| - } |
284 |
| - ], |
| 201 | + "execution_count": null, |
| 202 | + "outputs": [], |
285 | 203 | "source": [
|
286 | 204 | "total_words = sum(word_count.values())\n",
|
287 | 205 | "total_words"
|
288 | 206 | ],
|
289 | 207 | "metadata": {
|
290 |
| - "collapsed": false, |
291 |
| - "ExecuteTime": { |
292 |
| - "end_time": "2024-01-14T18:46:12.133664187Z", |
293 |
| - "start_time": "2024-01-14T18:46:12.130696893Z" |
294 |
| - } |
| 208 | + "collapsed": false |
295 | 209 | },
|
296 | 210 | "id": "57340fb3a953ba"
|
297 | 211 | },
|
|
309 | 223 | },
|
310 | 224 | {
|
311 | 225 | "cell_type": "code",
|
312 |
| - "execution_count": 13, |
313 |
| - "outputs": [ |
314 |
| - { |
315 |
| - "data": { |
316 |
| - "text/plain": "[(('mcteer', 'lonesom', 'dove'), 35.321830233977494),\n (('exot', 'melaleuca', 'iguana'), 35.321830233977494),\n (('lonesom', 'dove', 'aggi'), 35.321830233977494),\n (('terin', 'humphrey', 'annia'), 35.321830233977494),\n (('thelma', 'drake', 'norfolk'), 35.321830233977494),\n (('nr', 'narayana', 'murthi'), 35.321830233977494),\n (('unsign', 'adewal', 'ogunley'), 35.321830233977494),\n (('ellen', 'zane', 'oversaw'), 35.321830233977494),\n (('mou', 'tamanthi', 'hydroelectr'), 35.321830233977494),\n (('laserjet', '4345mfp', 'multifunct'), 35.321830233977494),\n (('sarwan', 'shivnarin', 'chanderpaul'), 35.321830233977494),\n (('781', '442', '0750'), 35.321830233977494),\n (('drool', 'alt', 'rocker'), 35.321830233977494),\n (('binti', 'pengiran', 'salleh'), 35.321830233977494),\n (('suitor', 'foodland', 'foa'), 35.321830233977494),\n (('bb', 'lob', 'avg'), 35.321830233977494),\n (('nokiajoinssecuredigitalindustrygroup', '2100', '1039_3'),\n 35.321830233977494),\n (('troi', 'rivier', 'que'), 35.321830233977494),\n (('4345mfp', 'multifunct', 'copier'), 35.321830233977494),\n (('intravascular', 'coagul', 'dic'), 35.321830233977494),\n (('munzala', 'arunach', 'macaqu'), 35.321830233977494),\n (('mp3s', 'blatant', 'disregard'), 35.321830233977494),\n (('guidug', 'guh', 'doo'), 35.321830233977494),\n (('kandanski', '781', '442'), 35.321830233977494),\n (('azahari', 'noordin', 'moh'), 35.321830233977494),\n (('blatant', 'disregard', 'hilari'), 35.321830233977494),\n (('bink', 'lookalik', 'gungan'), 35.321830233977494),\n (('jo', 'wilfri', 'tsonga'), 35.321830233977494),\n (('tung', 'chee', 'hwa'), 35.321830233977494),\n (('macaca', 'munzala', 'arunach'), 35.321830233977494),\n (('middleborough', 'middleboro', 'cobra'), 35.321830233977494),\n (('jane', 'westborough', 'woke'), 35.321830233977494),\n (('netinfomanag', 'postfix', 'serveradmin'), 35.321830233977494),\n (('loren', 'galler', 'rabinowitz'), 35.321830233977494),\n (('petroliam', 'nasion', 'bhd'), 35.321830233977494),\n (('ramnaresh', 'sarwan', 'shivnarin'), 35.321830233977494),\n (('fourier', 'spectromet', 'pfs'), 35.321830233977494),\n (('inver', 'caledonian', 'thistl'), 35.321830233977494),\n (('cna', 'academia', 'sinica'), 35.321830233977494),\n (('2100', '1039_3', '5365922'), 35.321830233977494),\n (('yu', 'shyi', 'kun'), 35.321830233977494),\n (('klien', 'vitantonio', 'liuzzi'), 35.321830233977494),\n (('folger', 'espresso', 'dunkin'), 35.321830233977494),\n (('olympiqu', 'marseill', '1993'), 35.321830233977494),\n (('fsb', 'fud', 'foi'), 35.321830233977494),\n (('ku', 'klux', 'klan'), 35.321830233977494),\n (('shadi', 'nook', 'cranni'), 35.321830233977494),\n (('pickoff', 'cutoff', 'bunt'), 35.321830233977494),\n (('gino', 'guidug', 'guh'), 35.321830233977494),\n (('palett', 'pastel', 'hue'), 35.321830233977494),\n (('sher', 'bahadur', 'deuba'), 35.321830233977494),\n (('humidor', 'darth', 'vader'), 35.321830233977494),\n (('nesn', 'weei', 'lhp'), 35.321830233977494),\n (('jsp', 'storyid', '53949'), 34.321830233977494),\n (('headshak', 'seti', 'headfak'), 34.321830233977494),\n (('mk', 'matan', 'vilnai'), 34.321830233977494),\n (('junki', 'whet', 'appetit'), 34.321830233977494),\n (('jar', 'bink', 'lookalik'), 34.321830233977494),\n (('fewest', 'numbest', 'unearn'), 34.321830233977494),\n (('ah', 'jaffor', 'ullah'), 34.321830233977494)]" |
317 |
| - }, |
318 |
| - "execution_count": 13, |
319 |
| - "metadata": {}, |
320 |
| - "output_type": "execute_result" |
321 |
| - } |
322 |
| - ], |
| 226 | + "execution_count": null, |
| 227 | + "outputs": [], |
323 | 228 | "source": [
|
324 | 229 | "ngram_score = {}\n",
|
325 | 230 | "for ngram in set(ngrams):\n",
|
|
330 | 235 | "sorted(ngram_score.items(), key=lambda x: -x[1])[0:60]"
|
331 | 236 | ],
|
332 | 237 | "metadata": {
|
333 |
| - "collapsed": false, |
334 |
| - "ExecuteTime": { |
335 |
| - "end_time": "2024-01-14T18:46:12.563721516Z", |
336 |
| - "start_time": "2024-01-14T18:46:12.195696235Z" |
337 |
| - } |
| 238 | + "collapsed": false |
338 | 239 | },
|
339 | 240 | "id": "239eade116446b8a"
|
340 | 241 | },
|
341 | 242 | {
|
342 | 243 | "cell_type": "code",
|
343 |
| - "execution_count": 14, |
344 |
| - "outputs": [ |
345 |
| - { |
346 |
| - "data": { |
347 |
| - "text/plain": "[('1913', 'doesnt', 'clearcut'),\n ('1x1', 'ord', '200301151450'),\n ('2100', '1039_3', '5365922'),\n ('4345mfp', 'multifunct', 'copier'),\n ('563', 'kph', 'vampir'),\n ('781', '442', '0750'),\n ('azahari', 'noordin', 'moh'),\n ('bb', 'lob', 'avg'),\n ('bink', 'lookalik', 'gungan'),\n ('binti', 'pengiran', 'salleh'),\n ('blatant', 'disregard', 'hilari'),\n ('cna', 'academia', 'sinica'),\n ('drool', 'alt', 'rocker'),\n ('ellen', 'zane', 'oversaw'),\n ('exot', 'melaleuca', 'iguana'),\n ('folger', 'espresso', 'dunkin'),\n ('fourier', 'spectromet', 'pfs'),\n ('fsb', 'fud', 'foi'),\n ('gino', 'guidug', 'guh'),\n ('guidug', 'guh', 'doo'),\n ('humidor', 'darth', 'vader'),\n ('intravascular', 'coagul', 'dic'),\n ('inver', 'caledonian', 'thistl'),\n ('jane', 'westborough', 'woke'),\n ('jarkko', 'nieminen', 'overpow'),\n ('jo', 'wilfri', 'tsonga'),\n ('kandanski', '781', '442'),\n ('klien', 'vitantonio', 'liuzzi'),\n ('ku', 'klux', 'klan'),\n ('laserjet', '4345mfp', 'multifunct')]" |
348 |
| - }, |
349 |
| - "execution_count": 14, |
350 |
| - "metadata": {}, |
351 |
| - "output_type": "execute_result" |
352 |
| - } |
353 |
| - ], |
| 244 | + "execution_count": null, |
| 245 | + "outputs": [], |
354 | 246 | "source": [
|
355 | 247 | "text = []\n",
|
356 | 248 | "for sentence in sentences:\n",
|
|
359 | 251 | "finder.nbest(nltk.collocations.TrigramAssocMeasures().mi_like, 30)"
|
360 | 252 | ],
|
361 | 253 | "metadata": {
|
362 |
| - "collapsed": false, |
363 |
| - "ExecuteTime": { |
364 |
| - "end_time": "2024-01-14T18:46:13.647509678Z", |
365 |
| - "start_time": "2024-01-14T18:46:12.574914843Z" |
366 |
| - } |
| 254 | + "collapsed": false |
367 | 255 | },
|
368 | 256 | "id": "7fa7f548f73d5ff4"
|
369 | 257 | }
|
|
0 commit comments