Skip to content

Commit fa459ec

Browse files
committed
Правки ревью
1 parent 3264a20 commit fa459ec

File tree

5 files changed

+423
-21815
lines changed

5 files changed

+423
-21815
lines changed

projects/ad-news-classification/lab_1/nlp_1.ipynb

+32-68
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,7 @@
44
"cell_type": "code",
55
"execution_count": null,
66
"id": "initial_id",
7-
"metadata": {
8-
"collapsed": true
9-
},
7+
"metadata": {},
108
"outputs": [],
119
"source": [
1210
"import nltk\n",
@@ -20,6 +18,8 @@
2018
{
2119
"cell_type": "code",
2220
"execution_count": null,
21+
"id": "10d57caa4259352a",
22+
"metadata": {},
2323
"outputs": [],
2424
"source": [
2525
"import os\n",
@@ -29,27 +29,23 @@
2929
"df.drop(columns=['Title', 'Description'], axis=1, inplace=True)\n",
3030
"print(df.head())\n",
3131
"print(df['text'][1])"
32-
],
33-
"metadata": {
34-
"collapsed": false
35-
},
36-
"id": "10d57caa4259352a"
32+
]
3733
},
3834
{
3935
"cell_type": "code",
4036
"execution_count": null,
37+
"id": "ad6c6f57643b7841",
38+
"metadata": {},
4139
"outputs": [],
4240
"source": [
4341
"df['label']"
44-
],
45-
"metadata": {
46-
"collapsed": false
47-
},
48-
"id": "ad6c6f57643b7841"
42+
]
4943
},
5044
{
5145
"cell_type": "code",
5246
"execution_count": null,
47+
"id": "f16b78e23c14172f",
48+
"metadata": {},
5349
"outputs": [],
5450
"source": [
5551
"import re\n",
@@ -61,29 +57,25 @@
6157
" sentences = re.split(\n",
6258
" r\"(((?<!\\w\\.\\w.)(?<!\\s\\w\\.)(?<![A-Z][a-z]\\.)(?<=\\.|\\?|\\!)\\s(?=[A-Z]))|((?<![\\,\\-\\:])\\n(?=[A-Z]|\\\" )))\", text)[::4]\n",
6359
" return sentences"
64-
],
65-
"metadata": {
66-
"collapsed": false
67-
},
68-
"id": "f16b78e23c14172f"
60+
]
6961
},
7062
{
7163
"cell_type": "code",
7264
"execution_count": null,
65+
"id": "bded99f420ae9022",
66+
"metadata": {},
7367
"outputs": [],
7468
"source": [
7569
"def split_to_words(sentence):\n",
7670
" words = re.findall(r\"\\w+@\\w+\\.\\w+|\\+\\d{1,3}-\\d{3}-\\d{3}-\\d{2}-\\d{2}|\\w+\", sentence)\n",
7771
" return words"
78-
],
79-
"metadata": {
80-
"collapsed": false
81-
},
82-
"id": "bded99f420ae9022"
72+
]
8373
},
8474
{
8575
"cell_type": "code",
8676
"execution_count": null,
77+
"id": "e627613a3632834b",
78+
"metadata": {},
8779
"outputs": [],
8880
"source": [
8981
"def save_to_file(original, lemmatized, stemmed, id, path):\n",
@@ -93,15 +85,13 @@
9385
" print(\"\", file=f)\n",
9486
" else:\n",
9587
" print(original[i], stemmed[i], lemmatized[i], sep=\"\\t\", file=f)"
96-
],
97-
"metadata": {
98-
"collapsed": false
99-
},
100-
"id": "e627613a3632834b"
88+
]
10189
},
10290
{
10391
"cell_type": "code",
10492
"execution_count": null,
93+
"id": "b2f4a0464b92971e",
94+
"metadata": {},
10595
"outputs": [],
10696
"source": [
10797
"from nltk import WordNetLemmatizer\n",
@@ -130,82 +120,56 @@
130120
" stemmed.append(sst.stem(w_processed))\n",
131121
" original.append(w_processed)\n",
132122
" save_to_file(original, lemmatized, stemmed, os.path.join(str(row['label']), f'{str(counter)}.tsv'), path)"
133-
],
134-
"metadata": {
135-
"collapsed": false
136-
},
137-
"id": "b2f4a0464b92971e"
123+
]
138124
},
139125
{
140126
"cell_type": "code",
141127
"execution_count": null,
128+
"id": "eae86957a0812dbe",
129+
"metadata": {},
142130
"outputs": [],
143-
"source": [],
144-
"metadata": {
145-
"collapsed": false
146-
},
147-
"id": "eae86957a0812dbe"
131+
"source": []
148132
},
149133
{
150134
"cell_type": "code",
151135
"execution_count": null,
136+
"id": "9dc8c43fc3a82676",
137+
"metadata": {},
152138
"outputs": [],
153139
"source": [
154140
"df = pd.read_csv(os.path.join('../../dataset/raw/train.csv'), names=['label', 'Title', 'Description'])\n",
155141
"df['text'] = (df['Title'] + '. ' + df['Description'])\n",
156142
"df.drop(columns=['Title', 'Description'], axis=1, inplace=True)\n",
157143
"\n",
158144
"process_file(df, os.path.join('..', 'assets', 'annotated-corpus', 'train'))"
159-
],
160-
"metadata": {
161-
"collapsed": false
162-
},
163-
"id": "9dc8c43fc3a82676"
145+
]
164146
},
165147
{
166148
"cell_type": "code",
167149
"execution_count": null,
150+
"id": "65ad549a2d70ff6c",
151+
"metadata": {},
168152
"outputs": [],
169153
"source": [
170154
"df = pd.read_csv(os.path.join('../../dataset/raw/test.csv'), names=['label', 'Title', 'Description'])\n",
171155
"df['text'] = (df['Title'] + '. ' + df['Description'])\n",
172156
"df.drop(columns=['Title', 'Description'], axis=1, inplace=True)\n",
173157
"\n",
174158
"process_file(df, os.path.join('..', 'assets', 'annotated-corpus', 'test'))"
175-
],
176-
"metadata": {
177-
"collapsed": false
178-
},
179-
"id": "65ad549a2d70ff6c"
159+
]
180160
},
181161
{
182162
"cell_type": "code",
183163
"execution_count": null,
164+
"id": "174c4b216214cc80",
165+
"metadata": {},
184166
"outputs": [],
185-
"source": [],
186-
"metadata": {
187-
"collapsed": false
188-
},
189-
"id": "174c4b216214cc80"
167+
"source": []
190168
}
191169
],
192170
"metadata": {
193-
"kernelspec": {
194-
"display_name": "Python 3",
195-
"language": "python",
196-
"name": "python3"
197-
},
198171
"language_info": {
199-
"codemirror_mode": {
200-
"name": "ipython",
201-
"version": 2
202-
},
203-
"file_extension": ".py",
204-
"mimetype": "text/x-python",
205-
"name": "python",
206-
"nbconvert_exporter": "python",
207-
"pygments_lexer": "ipython2",
208-
"version": "2.7.6"
172+
"name": "python"
209173
}
210174
},
211175
"nbformat": 4,

0 commit comments

Comments
 (0)