|
4 | 4 | "cell_type": "code",
|
5 | 5 | "execution_count": null,
|
6 | 6 | "id": "initial_id",
|
7 |
| - "metadata": { |
8 |
| - "collapsed": true |
9 |
| - }, |
| 7 | + "metadata": {}, |
10 | 8 | "outputs": [],
|
11 | 9 | "source": [
|
12 | 10 | "import nltk\n",
|
|
20 | 18 | {
|
21 | 19 | "cell_type": "code",
|
22 | 20 | "execution_count": null,
|
| 21 | + "id": "10d57caa4259352a", |
| 22 | + "metadata": {}, |
23 | 23 | "outputs": [],
|
24 | 24 | "source": [
|
25 | 25 | "import os\n",
|
|
29 | 29 | "df.drop(columns=['Title', 'Description'], axis=1, inplace=True)\n",
|
30 | 30 | "print(df.head())\n",
|
31 | 31 | "print(df['text'][1])"
|
32 |
| - ], |
33 |
| - "metadata": { |
34 |
| - "collapsed": false |
35 |
| - }, |
36 |
| - "id": "10d57caa4259352a" |
| 32 | + ] |
37 | 33 | },
|
38 | 34 | {
|
39 | 35 | "cell_type": "code",
|
40 | 36 | "execution_count": null,
|
| 37 | + "id": "ad6c6f57643b7841", |
| 38 | + "metadata": {}, |
41 | 39 | "outputs": [],
|
42 | 40 | "source": [
|
43 | 41 | "df['label']"
|
44 |
| - ], |
45 |
| - "metadata": { |
46 |
| - "collapsed": false |
47 |
| - }, |
48 |
| - "id": "ad6c6f57643b7841" |
| 42 | + ] |
49 | 43 | },
|
50 | 44 | {
|
51 | 45 | "cell_type": "code",
|
52 | 46 | "execution_count": null,
|
| 47 | + "id": "f16b78e23c14172f", |
| 48 | + "metadata": {}, |
53 | 49 | "outputs": [],
|
54 | 50 | "source": [
|
55 | 51 | "import re\n",
|
|
61 | 57 | " sentences = re.split(\n",
|
62 | 58 | " r\"(((?<!\\w\\.\\w.)(?<!\\s\\w\\.)(?<![A-Z][a-z]\\.)(?<=\\.|\\?|\\!)\\s(?=[A-Z]))|((?<![\\,\\-\\:])\\n(?=[A-Z]|\\\" )))\", text)[::4]\n",
|
63 | 59 | " return sentences"
|
64 |
| - ], |
65 |
| - "metadata": { |
66 |
| - "collapsed": false |
67 |
| - }, |
68 |
| - "id": "f16b78e23c14172f" |
| 60 | + ] |
69 | 61 | },
|
70 | 62 | {
|
71 | 63 | "cell_type": "code",
|
72 | 64 | "execution_count": null,
|
| 65 | + "id": "bded99f420ae9022", |
| 66 | + "metadata": {}, |
73 | 67 | "outputs": [],
|
74 | 68 | "source": [
|
75 | 69 | "def split_to_words(sentence):\n",
|
76 | 70 | " words = re.findall(r\"\\w+@\\w+\\.\\w+|\\+\\d{1,3}-\\d{3}-\\d{3}-\\d{2}-\\d{2}|\\w+\", sentence)\n",
|
77 | 71 | " return words"
|
78 |
| - ], |
79 |
| - "metadata": { |
80 |
| - "collapsed": false |
81 |
| - }, |
82 |
| - "id": "bded99f420ae9022" |
| 72 | + ] |
83 | 73 | },
|
84 | 74 | {
|
85 | 75 | "cell_type": "code",
|
86 | 76 | "execution_count": null,
|
| 77 | + "id": "e627613a3632834b", |
| 78 | + "metadata": {}, |
87 | 79 | "outputs": [],
|
88 | 80 | "source": [
|
89 | 81 | "def save_to_file(original, lemmatized, stemmed, id, path):\n",
|
|
93 | 85 | " print(\"\", file=f)\n",
|
94 | 86 | " else:\n",
|
95 | 87 | " print(original[i], stemmed[i], lemmatized[i], sep=\"\\t\", file=f)"
|
96 |
| - ], |
97 |
| - "metadata": { |
98 |
| - "collapsed": false |
99 |
| - }, |
100 |
| - "id": "e627613a3632834b" |
| 88 | + ] |
101 | 89 | },
|
102 | 90 | {
|
103 | 91 | "cell_type": "code",
|
104 | 92 | "execution_count": null,
|
| 93 | + "id": "b2f4a0464b92971e", |
| 94 | + "metadata": {}, |
105 | 95 | "outputs": [],
|
106 | 96 | "source": [
|
107 | 97 | "from nltk import WordNetLemmatizer\n",
|
|
130 | 120 | " stemmed.append(sst.stem(w_processed))\n",
|
131 | 121 | " original.append(w_processed)\n",
|
132 | 122 | " save_to_file(original, lemmatized, stemmed, os.path.join(str(row['label']), f'{str(counter)}.tsv'), path)"
|
133 |
| - ], |
134 |
| - "metadata": { |
135 |
| - "collapsed": false |
136 |
| - }, |
137 |
| - "id": "b2f4a0464b92971e" |
| 123 | + ] |
138 | 124 | },
|
139 | 125 | {
|
140 | 126 | "cell_type": "code",
|
141 | 127 | "execution_count": null,
|
| 128 | + "id": "eae86957a0812dbe", |
| 129 | + "metadata": {}, |
142 | 130 | "outputs": [],
|
143 |
| - "source": [], |
144 |
| - "metadata": { |
145 |
| - "collapsed": false |
146 |
| - }, |
147 |
| - "id": "eae86957a0812dbe" |
| 131 | + "source": [] |
148 | 132 | },
|
149 | 133 | {
|
150 | 134 | "cell_type": "code",
|
151 | 135 | "execution_count": null,
|
| 136 | + "id": "9dc8c43fc3a82676", |
| 137 | + "metadata": {}, |
152 | 138 | "outputs": [],
|
153 | 139 | "source": [
|
154 | 140 | "df = pd.read_csv(os.path.join('../../dataset/raw/train.csv'), names=['label', 'Title', 'Description'])\n",
|
155 | 141 | "df['text'] = (df['Title'] + '. ' + df['Description'])\n",
|
156 | 142 | "df.drop(columns=['Title', 'Description'], axis=1, inplace=True)\n",
|
157 | 143 | "\n",
|
158 | 144 | "process_file(df, os.path.join('..', 'assets', 'annotated-corpus', 'train'))"
|
159 |
| - ], |
160 |
| - "metadata": { |
161 |
| - "collapsed": false |
162 |
| - }, |
163 |
| - "id": "9dc8c43fc3a82676" |
| 145 | + ] |
164 | 146 | },
|
165 | 147 | {
|
166 | 148 | "cell_type": "code",
|
167 | 149 | "execution_count": null,
|
| 150 | + "id": "65ad549a2d70ff6c", |
| 151 | + "metadata": {}, |
168 | 152 | "outputs": [],
|
169 | 153 | "source": [
|
170 | 154 | "df = pd.read_csv(os.path.join('../../dataset/raw/test.csv'), names=['label', 'Title', 'Description'])\n",
|
171 | 155 | "df['text'] = (df['Title'] + '. ' + df['Description'])\n",
|
172 | 156 | "df.drop(columns=['Title', 'Description'], axis=1, inplace=True)\n",
|
173 | 157 | "\n",
|
174 | 158 | "process_file(df, os.path.join('..', 'assets', 'annotated-corpus', 'test'))"
|
175 |
| - ], |
176 |
| - "metadata": { |
177 |
| - "collapsed": false |
178 |
| - }, |
179 |
| - "id": "65ad549a2d70ff6c" |
| 159 | + ] |
180 | 160 | },
|
181 | 161 | {
|
182 | 162 | "cell_type": "code",
|
183 | 163 | "execution_count": null,
|
| 164 | + "id": "174c4b216214cc80", |
| 165 | + "metadata": {}, |
184 | 166 | "outputs": [],
|
185 |
| - "source": [], |
186 |
| - "metadata": { |
187 |
| - "collapsed": false |
188 |
| - }, |
189 |
| - "id": "174c4b216214cc80" |
| 167 | + "source": [] |
190 | 168 | }
|
191 | 169 | ],
|
192 | 170 | "metadata": {
|
193 |
| - "kernelspec": { |
194 |
| - "display_name": "Python 3", |
195 |
| - "language": "python", |
196 |
| - "name": "python3" |
197 |
| - }, |
198 | 171 | "language_info": {
|
199 |
| - "codemirror_mode": { |
200 |
| - "name": "ipython", |
201 |
| - "version": 2 |
202 |
| - }, |
203 |
| - "file_extension": ".py", |
204 |
| - "mimetype": "text/x-python", |
205 |
| - "name": "python", |
206 |
| - "nbconvert_exporter": "python", |
207 |
| - "pygments_lexer": "ipython2", |
208 |
| - "version": "2.7.6" |
| 172 | + "name": "python" |
209 | 173 | }
|
210 | 174 | },
|
211 | 175 | "nbformat": 4,
|
|
0 commit comments