MANASLU8 · ADmangarakov · Jan 18, 2024 · Jan 18, 2024 · Jan 18, 2024 · Jan 18, 2024
diff --git a/README.md b/README.md
@@ -33,3 +33,4 @@
 | [news_nlp](/projects/news_nlp) | Строкова Анастасия Владиславовна |
 | [newsgroups-classification](/projects/newsgroups-classification) | Герасимчук Михаил Юрьевич |
 | [fake-news-classifier](/projects/fake-news-classifier) | Артемьев Алексей Дмитриевич |
+| [ad-news-classification](/projects/ad-news-classification) | Мангараков Александр Дмитриевич |
diff --git a/projects/ad-news-classification/lab_1/nlp_1.ipynb b/projects/ad-news-classification/lab_1/nlp_1.ipynb
@@ -0,0 +1,177 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "initial_id",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import nltk\n",
+    "\n",
+    "nltk.download('punkt')\n",
+    "nltk.download('omw-1.4')\n",
+    "from nltk.tokenize import word_tokenize, sent_tokenize\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "10d57caa4259352a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "\n",
+    "# Read the raw training CSV, supplying the column names explicitly.\n",
+    "df = pd.read_csv('../../dataset/raw/train.csv', names=['label', 'Title', 'Description'])\n",
+    "\n",
+    "# Merge title and description into one 'text' column and keep only label + text.\n",
+    "df = df.assign(text=df['Title'] + '. ' + df['Description']).drop(columns=['Title', 'Description'])\n",
+    "\n",
+    "# Sanity check: the first rows and one complete example.\n",
+    "print(df.head())\n",
+    "print(df['text'][1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ad6c6f57643b7841",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Display the label column as it was read from the CSV.\n",
+    "df.loc[:, 'label']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f16b78e23c14172f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "\n",
+    "def split_to_sent(text):\n",
+    "    \"\"\"Split a text into a list of sentence strings.\n",
+    "\n",
+    "    Escaped HTML tags (&lt;...&gt;) are blanked out first. The text is then cut at\n",
+    "    a whitespace character that follows '.', '?' or '!' and precedes an uppercase\n",
+    "    ASCII letter (unless one of the three negative lookbehinds applies), or at a\n",
+    "    newline that is not preceded by ',', '-' or ':' and is followed by an uppercase\n",
+    "    ASCII letter or a double quote plus space. Separators are not returned.\n",
+    "    \"\"\"\n",
+    "    # Blank the text between the entity markers first, then the markers themselves.\n",
+    "    text = re.sub(r\"(?<=&lt;).*?(?=&gt;)\", \" \", text)\n",
+    "    text = re.sub(r\"&gt;\", \" \", text)\n",
+    "    text = re.sub(r\"&lt;\", \" \", text)\n",
+    "    # Non-capturing groups: re.split then returns only the sentences, so the result\n",
+    "    # needs no stride slicing that depends on the number of groups in the pattern.\n",
+    "    return re.split(\n",
+    "        r\"(?:(?<!\\w\\.\\w.)(?<!\\s\\w\\.)(?<![A-Z][a-z]\\.)(?<=\\.|\\?|\\!)\\s(?=[A-Z]))|(?:(?<![\\,\\-\\:])\\n(?=[A-Z]|\\\" ))\", text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bded99f420ae9022",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def split_to_words(sentence):\n",
+    "    \"\"\"Return the tokens found in a sentence, in order of appearance.\n",
+    "\n",
+    "    A token is, tried in this order: an e-mail-like string (word@word.word), a\n",
+    "    phone number shaped like +D-DDD-DDD-DD-DD (one to three digits in the first\n",
+    "    group), or a run of word characters. Everything else is skipped.\n",
+    "    \"\"\"\n",
+    "    token_pattern = r\"\\w+@\\w+\\.\\w+|\\+\\d{1,3}-\\d{3}-\\d{3}-\\d{2}-\\d{2}|\\w+\"\n",
+    "    return re.findall(token_pattern, sentence)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e627613a3632834b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def save_to_file(original, lemmatized, stemmed, id, path):\n",
+    "    \"\"\"Write one document as a TSV file with token, stem and lemma columns.\n",
+    "\n",
+    "    original, lemmatized, stemmed: parallel sequences, expected to have equal\n",
+    "        length. A token that is a single newline character marks a sentence\n",
+    "        boundary and is written as an empty line.\n",
+    "    id: file name relative to path; it may contain a sub-directory.\n",
+    "    path: root output directory.\n",
+    "    \"\"\"\n",
+    "    target = os.path.join(path, id)\n",
+    "    # The sub-directory may not exist yet; open() would fail without it.\n",
+    "    parent = os.path.dirname(target)\n",
+    "    if parent:\n",
+    "        os.makedirs(parent, exist_ok=True)\n",
+    "    # Explicit encoding so the output does not depend on the platform default.\n",
+    "    with open(target, \"w\", encoding=\"utf-8\") as f:\n",
+    "        for token, stem, lemma in zip(original, stemmed, lemmatized):\n",
+    "            if token == \"\\n\":\n",
+    "                print(\"\", file=f)\n",
+    "            else:\n",
+    "                print(token, stem, lemma, sep=\"\\t\", file=f)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b2f4a0464b92971e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nltk import WordNetLemmatizer\n",
+    "from nltk import SnowballStemmer\n",
+    "import nltk\n",
+    "nltk.download('wordnet')\n",
+    "\n",
+    "\n",
+    "def process_file(df, path):\n",
+    "    \"\"\"Annotate every row of df and write one TSV file per row.\n",
+    "\n",
+    "    df: DataFrame with 'label' and 'text' columns.\n",
+    "    path: root output directory; each row is written to <path>/<label>/<n>.tsv,\n",
+    "        where n counts the rows starting from 1.\n",
+    "\n",
+    "    Each text is split into sentences and then into tokens. Every token is\n",
+    "    lowercased and stored with its Snowball stem and WordNet lemma. A newline\n",
+    "    token is appended after each sentence as a boundary marker.\n",
+    "    \"\"\"\n",
+    "    wnl = WordNetLemmatizer()\n",
+    "    sst = SnowballStemmer(\"english\")\n",
+    "    # Walk the two columns directly: iterrows() would build a Series per row and\n",
+    "    # its index value is not needed here.\n",
+    "    for counter, (label, text) in enumerate(zip(df['label'], df['text']), start=1):\n",
+    "        words_dic = []\n",
+    "        for s in split_to_sent(text):\n",
+    "            words_dic += split_to_words(s)\n",
+    "            words_dic.append(\"\\n\")\n",
+    "        # Strip one trailing punctuation mark, if any, and lowercase each token.\n",
+    "        original = [re.sub(r\"[.!?,]$\", \"\", w).lower() for w in words_dic]\n",
+    "        lemmatized = [wnl.lemmatize(w) for w in original]\n",
+    "        stemmed = [sst.stem(w) for w in original]\n",
+    "        save_to_file(original, lemmatized, stemmed, os.path.join(str(label), f'{counter}.tsv'), path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "eae86957a0812dbe",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9dc8c43fc3a82676",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the training CSV, merge title and description into 'text', keep label + text.\n",
+    "df = pd.read_csv('../../dataset/raw/train.csv', names=['label', 'Title', 'Description'])\n",
+    "df = df.assign(text=df['Title'] + '. ' + df['Description']).drop(columns=['Title', 'Description'])\n",
+    "\n",
+    "# Write the annotated training corpus.\n",
+    "process_file(df, os.path.join('..', 'assets', 'annotated-corpus', 'train'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "65ad549a2d70ff6c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the test CSV, merge title and description into 'text', keep label + text.\n",
+    "df = pd.read_csv('../../dataset/raw/test.csv', names=['label', 'Title', 'Description'])\n",
+    "df = df.assign(text=df['Title'] + '. ' + df['Description']).drop(columns=['Title', 'Description'])\n",
+    "\n",
+    "# Write the annotated test corpus.\n",
+    "process_file(df, os.path.join('..', 'assets', 'annotated-corpus', 'test'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "174c4b216214cc80",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}