|
2 | 2 | "cells": [
|
3 | 3 | {
|
4 | 4 | "cell_type": "code",
|
5 |
| - "execution_count": 1, |
| 5 | + "execution_count": null, |
6 | 6 | "id": "initial_id",
|
7 | 7 | "metadata": {
|
8 |
| - "collapsed": true, |
9 |
| - "ExecuteTime": { |
10 |
| - "end_time": "2024-01-14T18:44:33.991125465Z", |
11 |
| - "start_time": "2024-01-14T18:44:33.689398819Z" |
12 |
| - } |
| 8 | + "collapsed": true |
13 | 9 | },
|
14 | 10 | "outputs": [],
|
15 | 11 | "source": [
|
|
24 | 20 | },
|
25 | 21 | {
|
26 | 22 | "cell_type": "code",
|
27 |
| - "outputs": [ |
28 |
| - { |
29 |
| - "name": "stderr", |
30 |
| - "output_type": "stream", |
31 |
| - "text": [ |
32 |
| - "[nltk_data] Downloading package punkt to /home/maxim/nltk_data...\n", |
33 |
| - "[nltk_data] Package punkt is already up-to-date!\n", |
34 |
| - "[nltk_data] Downloading package omw-1.4 to /home/maxim/nltk_data...\n", |
35 |
| - "[nltk_data] Package omw-1.4 is already up-to-date!\n", |
36 |
| - "[nltk_data] Downloading package wordnet to /home/maxim/nltk_data...\n", |
37 |
| - "[nltk_data] Package wordnet is already up-to-date!\n" |
38 |
| - ] |
39 |
| - }, |
40 |
| - { |
41 |
| - "data": { |
42 |
| - "text/plain": "True" |
43 |
| - }, |
44 |
| - "execution_count": 2, |
45 |
| - "metadata": {}, |
46 |
| - "output_type": "execute_result" |
47 |
| - } |
48 |
| - ], |
| 23 | + "outputs": [], |
49 | 24 | "source": [
|
50 | 25 | "nltk.download('punkt')\n",
|
51 | 26 | "nltk.download('omw-1.4')\n",
|
52 | 27 | "nltk.download('wordnet')"
|
53 | 28 | ],
|
54 | 29 | "metadata": {
|
55 |
| - "collapsed": false, |
56 |
| - "ExecuteTime": { |
57 |
| - "end_time": "2024-01-14T18:44:34.341115279Z", |
58 |
| - "start_time": "2024-01-14T18:44:33.992572197Z" |
59 |
| - } |
| 30 | + "collapsed": false |
60 | 31 | },
|
61 | 32 | "id": "92a102ee9ba336d7",
|
62 |
| - "execution_count": 2 |
| 33 | + "execution_count": null |
63 | 34 | },
|
64 | 35 | {
|
65 | 36 | "cell_type": "code",
|
66 |
| - "execution_count": 3, |
67 |
| - "outputs": [ |
68 |
| - { |
69 |
| - "name": "stdout", |
70 |
| - "output_type": "stream", |
71 |
| - "text": [ |
72 |
| - " label text\n", |
73 |
| - "0 3 Wall St. Bears Claw Back Into the Black (Reute...\n", |
74 |
| - "1 3 Carlyle Looks Toward Commercial Aerospace (Reu...\n", |
75 |
| - "2 3 Oil and Economy Cloud Stocks' Outlook (Reuters...\n", |
76 |
| - "3 3 Iraq Halts Oil Exports from Main Southern Pipe...\n", |
77 |
| - "4 3 Oil prices soar to all-time record, posing new...\n", |
78 |
| - "Carlyle Looks Toward Commercial Aerospace (Reuters). Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.\n" |
79 |
| - ] |
80 |
| - } |
81 |
| - ], |
| 37 | + "execution_count": null, |
| 38 | + "outputs": [], |
82 | 39 | "source": [
|
83 | 40 | "df = pd.read_csv(os.path.join('../dataset/raw/train.csv'), names=['label', 'Title', 'Description'])\n",
|
84 | 41 | "df['text'] = (df['Title'] + '. ' + df['Description'])\n",
|
|
87 | 44 | "print(df['text'][1])"
|
88 | 45 | ],
|
89 | 46 | "metadata": {
|
90 |
| - "collapsed": false, |
91 |
| - "ExecuteTime": { |
92 |
| - "end_time": "2024-01-14T18:44:34.665186382Z", |
93 |
| - "start_time": "2024-01-14T18:44:34.342871597Z" |
94 |
| - } |
| 47 | + "collapsed": false |
95 | 48 | },
|
96 | 49 | "id": "10d57caa4259352a"
|
97 | 50 | },
|
98 | 51 | {
|
99 | 52 | "cell_type": "code",
|
100 |
| - "execution_count": 4, |
101 |
| - "outputs": [ |
102 |
| - { |
103 |
| - "data": { |
104 |
| - "text/plain": "0 3\n1 3\n2 3\n3 3\n4 3\n ..\n119995 1\n119996 2\n119997 2\n119998 2\n119999 2\nName: label, Length: 120000, dtype: int64" |
105 |
| - }, |
106 |
| - "execution_count": 4, |
107 |
| - "metadata": {}, |
108 |
| - "output_type": "execute_result" |
109 |
| - } |
110 |
| - ], |
| 53 | + "execution_count": null, |
| 54 | + "outputs": [], |
111 | 55 | "source": [
|
112 | 56 | "df['label']"
|
113 | 57 | ],
|
114 | 58 | "metadata": {
|
115 |
| - "collapsed": false, |
116 |
| - "ExecuteTime": { |
117 |
| - "end_time": "2024-01-14T18:44:34.673196662Z", |
118 |
| - "start_time": "2024-01-14T18:44:34.667446876Z" |
119 |
| - } |
| 59 | + "collapsed": false |
120 | 60 | },
|
121 | 61 | "id": "ad6c6f57643b7841"
|
122 | 62 | },
|
123 | 63 | {
|
124 | 64 | "cell_type": "code",
|
125 |
| - "execution_count": 5, |
| 65 | + "execution_count": null, |
126 | 66 | "outputs": [],
|
127 | 67 | "source": [
|
128 | 68 | "def process_file(df, path):\n",
|
|
147 | 87 | " save_to_file(original, lemmatized, stemmed, os.path.join(str(row['label']), f'{str(counter)}.tsv'), path)"
|
148 | 88 | ],
|
149 | 89 | "metadata": {
|
150 |
| - "collapsed": false, |
151 |
| - "ExecuteTime": { |
152 |
| - "end_time": "2024-01-14T18:44:34.674441301Z", |
153 |
| - "start_time": "2024-01-14T18:44:34.671605126Z" |
154 |
| - } |
| 90 | + "collapsed": false |
155 | 91 | },
|
156 | 92 | "id": "b2f4a0464b92971e"
|
157 | 93 | },
|
158 | 94 | {
|
159 | 95 | "cell_type": "code",
|
160 |
| - "execution_count": 6, |
| 96 | + "execution_count": null, |
161 | 97 | "outputs": [],
|
162 | 98 | "source": [
|
163 | 99 | "df = pd.read_csv(os.path.join('../dataset/raw/train.csv'), names=['label', 'Title', 'Description'])\n",
|
|
167 | 103 | "process_file(df, os.path.join('..', 'assets', 'annotated-corpus', 'train'))"
|
168 | 104 | ],
|
169 | 105 | "metadata": {
|
170 |
| - "collapsed": false, |
171 |
| - "ExecuteTime": { |
172 |
| - "end_time": "2024-01-14T18:45:41.882656959Z", |
173 |
| - "start_time": "2024-01-14T18:44:34.676441732Z" |
174 |
| - } |
| 106 | + "collapsed": false |
175 | 107 | },
|
176 | 108 | "id": "9dc8c43fc3a82676"
|
177 | 109 | },
|
178 | 110 | {
|
179 | 111 | "cell_type": "code",
|
180 |
| - "execution_count": 7, |
| 112 | + "execution_count": null, |
181 | 113 | "outputs": [],
|
182 | 114 | "source": [
|
183 | 115 | "df = pd.read_csv(os.path.join('../dataset/raw/test.csv'), names=['label', 'Title', 'Description'])\n",
|
|
187 | 119 | "process_file(df, os.path.join('..', 'assets', 'annotated-corpus', 'test'))"
|
188 | 120 | ],
|
189 | 121 | "metadata": {
|
190 |
| - "collapsed": false, |
191 |
| - "ExecuteTime": { |
192 |
| - "end_time": "2024-01-14T18:45:46.209967327Z", |
193 |
| - "start_time": "2024-01-14T18:45:41.882440048Z" |
194 |
| - } |
| 122 | + "collapsed": false |
195 | 123 | },
|
196 | 124 | "id": "65ad549a2d70ff6c"
|
197 | 125 | },
|
|
200 | 128 | "outputs": [],
|
201 | 129 | "source": [],
|
202 | 130 | "metadata": {
|
203 |
| - "collapsed": false, |
204 |
| - "ExecuteTime": { |
205 |
| - "end_time": "2024-01-14T18:45:46.220037002Z", |
206 |
| - "start_time": "2024-01-14T18:45:46.211685403Z" |
207 |
| - } |
| 131 | + "collapsed": false |
208 | 132 | },
|
209 | 133 | "id": "b6b3ae49a9f95763",
|
210 |
| - "execution_count": 7 |
| 134 | + "execution_count": null |
211 | 135 | },
|
212 | 136 | {
|
213 | 137 | "cell_type": "code",
|
|
0 commit comments