-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
16 changed files
with
5,364,655 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
/myenv | ||
*.env | ||
/csv_files |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
import os | ||
from langchain_openai import OpenAIEmbeddings | ||
from langchain_qdrant import QdrantVectorStore | ||
from langchain.prompts import ChatPromptTemplate | ||
from langchain_openai import ChatOpenAI | ||
from langchain_core.runnables import RunnablePassthrough | ||
from langchain_core.output_parsers.string import StrOutputParser | ||
import pandas as pd | ||
|
||
# Prompt for the retrieval-augmented grooming check. ``{context}`` receives the
# retrieved example conversations and ``{question}`` receives the chat under
# review. The wording is sent to the model as-is, so it is kept verbatim.
PROMPT_TEMPLATE = """
You are an expert on detecting grooming on chat conversations.
This are some grooming chat examples, keep in mind that conversation
messages are separated by a | character.
{context}
---
Taking into account the previous examples, do you identify any grooming behavior
in the next chat? Answer if the conversation is grooming or not, and
give the literal text that makes you think so.
{question}
"""
def join_docs(chunks):
    """Concatenate retrieved chunks into a single context string.

    Args:
        chunks: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values joined with one blank line between
        consecutive chunks; an empty string when ``chunks`` is empty.
    """
    return "\n\n".join(chunk.page_content for chunk in chunks)
|
||
class GroomingDetector:
    """Retrieval-augmented detector for grooming behaviour in chat text.

    On construction the detector connects to an existing Qdrant collection,
    builds a retriever over it, and wires a chain that feeds the retrieved
    example conversations plus the chat under review into a chat model.

    Args:
        qdrant_url: URL of the Qdrant instance holding the examples.
        qdrant_key: API key used to authenticate against Qdrant.
        temperature: Sampling temperature passed to the chat model.
        collection_name: Name of the existing Qdrant collection to query.
        embedding_model: OpenAI embedding model used to embed queries.
            NOTE(review): this should match the model used when the
            collection was populated -- confirm against the loader.
        llm_model: OpenAI chat model that produces the verdict.
    """

    # Defaults preserve the previously hard-coded values.
    DEFAULT_COLLECTION_NAME = "groom_chats"
    DEFAULT_EMBEDDING_MODEL = "text-embedding-3-large"
    DEFAULT_LLM_MODEL = "gpt-3.5-turbo"

    # Class-level fallback so ``load_vectorstore`` keeps working on instances
    # whose ``__init__`` was bypassed or overridden.
    collection_name = DEFAULT_COLLECTION_NAME

    def __init__(
        self,
        qdrant_url,
        qdrant_key,
        temperature=0,
        *,
        collection_name=DEFAULT_COLLECTION_NAME,
        embedding_model=DEFAULT_EMBEDDING_MODEL,
        llm_model=DEFAULT_LLM_MODEL,
    ):
        self.qdrant_url = qdrant_url
        self.qdrant_key = qdrant_key
        self.collection_name = collection_name
        self.embedding_function = OpenAIEmbeddings(model=embedding_model)
        # The vector store must exist before the chain, which needs the retriever.
        self.qdrant, self.retriever = self.load_vectorstore()
        self.prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
        self.llm = ChatOpenAI(model=llm_model, temperature=temperature)
        self.chain = self.load_chain()

    def load_vectorstore(self):
        """Open the existing Qdrant collection and build a retriever over it.

        Returns:
            A ``(vector_store, retriever)`` tuple.
        """
        qdrant = QdrantVectorStore.from_existing_collection(
            embedding=self.embedding_function,
            collection_name=self.collection_name,
            url=self.qdrant_url,
            api_key=self.qdrant_key,
        )
        retriever = qdrant.as_retriever()
        return qdrant, retriever

    def load_chain(self):
        """Assemble the retrieval -> prompt -> model -> string-parser chain.

        The input text is used twice: as the retrieval query, whose results
        are joined into ``context``, and unchanged as ``question``.

        Returns:
            The composed runnable chain.
        """
        chain = (
            {"context": self.retriever | join_docs, "question": RunnablePassthrough()}
            | self.prompt
            | self.llm
            | StrOutputParser()
        )
        return chain

    def invoke(self, text):
        """Run the detection chain on ``text`` and return the chain's output."""
        return self.chain.invoke(text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
from dotenv import load_dotenv | ||
import os | ||
from preprocess.clear_data import xml2csv | ||
from langchain_community.document_loaders.csv_loader import CSVLoader | ||
from langchain_openai import OpenAIEmbeddings | ||
from langchain_qdrant import QdrantVectorStore | ||
from langchain_openai import OpenAIEmbeddings | ||
from langchain_text_splitters import RecursiveCharacterTextSplitter | ||
|
||
# Read environment variables (e.g. the Qdrant settings used below) from a .env file.
load_dotenv()

# Load the "training" data, which will be the Document Store
CORPUS_DATA_PATH = "../pan12-sexual-predator/training/conversations.xml"
PREDATORS_DATA_PATH = "../pan12-sexual-predator/training/predator-ids.txt"
CORPUS_TEST_DATA_PATH = "../pan12-sexual-predator/test/conversations.xml"
PREDATORS_TEST_DATA_PATH = "../pan12-sexual-predator/test/predator-ids.txt"

os.makedirs("csv_files", exist_ok=True)
# Load the training data, in this case only load the abusive cases to store them in Qdrant.
xml2csv(
    nameXML=CORPUS_DATA_PATH,
    nameCSV="csv_files/abusive_text.csv",
    predatorsTXT=PREDATORS_DATA_PATH,
    only_abusive=True,
)
# Load the test data, in this case load all the cases to test the model.
xml2csv(
    nameXML=CORPUS_TEST_DATA_PATH,
    nameCSV="csv_files/abusive_text_test.csv",
    predatorsTXT=PREDATORS_TEST_DATA_PATH,
    only_abusive=False,
)

# Load the training data using the doc loader from langchain, make sure to specify the content,
# source and metadata columns.
# NOTE(review): with explicit "fieldnames" the first CSV row is read as data,
# not as a header -- confirm that xml2csv writes no header row.
loader = CSVLoader(
    file_path="csv_files/abusive_text.csv",
    csv_args={
        "delimiter": ";",
        "fieldnames": ["CONVERSATION_ID", "AUTHORS_IDS", "IS_ABUSIVE", "CONVERSATION_TEXT"],
    },
    content_columns=["CONVERSATION_TEXT"],
    source_column="CONVERSATION_ID",
    metadata_columns=["AUTHORS_IDS"],
)

data = loader.load()

# Create an instance of the RecursiveCharacterTextSplitter class with specific parameters.
# It is configured with a chunk size of 1000 and an overlap of 150.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)

# 'data' holds the loaded documents; 'splits' holds the chunked documents
# produced from them by the text splitter.
splits = text_splitter.split_documents(data)

# Use Qdrant to load the data into the vector store, using an embedding function to convert the text into vectors.
qdrant_url = os.getenv("QDRANT_URL")
qdrant_key = os.getenv("QDRANT_API_KEY")

embedding_function = OpenAIEmbeddings(model="text-embedding-3-large")

qdrant = QdrantVectorStore.from_documents(
    splits,
    embedding_function,
    url=qdrant_url,
    prefer_grpc=True,
    api_key=qdrant_key,
    collection_name="groom_chats",
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
[33mcommit a6a7cacb26c6ab60cbec713d36f1293005fd4fea[m[33m ([m[1;36mHEAD[m[33m -> [m[1;32mmain[m[33m)[m | ||
Author: kpaviles <kpam2435@gmail.com> | ||
Date: Thu Jan 2 23:36:27 2025 -0500 | ||
|
||
Modified xml docs | ||
|
||
[33mcommit 818493195494d89456270f39310e12e3ccb9b774[m | ||
Merge: 57444be e6a69b1 | ||
Author: kpaviles <kpam2435@gmail.com> | ||
Date: Thu Jan 2 23:21:46 2025 -0500 | ||
|
||
Merge branch 'main' of github.com:kaviles22/Grooming-Analysis | ||
|
||
[33mcommit 57444beaf3c49614b1057ca657860ea601896dbe[m | ||
Author: kpaviles <kpam2435@gmail.com> | ||
Date: Thu Jan 2 23:21:29 2025 -0500 | ||
|
||
First commit | ||
|
||
[33mcommit e6a69b1b21ee84ba8168786fa80edc70aacafcf2[m[33m ([m[1;31morigin/main[m[33m, [m[1;31morigin/HEAD[m[33m)[m | ||
Author: Karla Pavlova Aviles <59234567+kaviles22@users.noreply.github.com> | ||
Date: Thu Jan 2 23:20:37 2025 -0500 | ||
|
||
Rename .github/autoload_qdrant.yml to .github/workflows/autoload_qdrant.yml | ||
|
||
[33mcommit a0923149f86675cb97ad83eba03b74923976da75[m | ||
Author: Karla Pavlova Aviles <59234567+kaviles22@users.noreply.github.com> | ||
Date: Thu Jan 2 23:19:05 2025 -0500 | ||
|
||
Create autoload_qdrant.yml | ||
|
||
[33mcommit a3cc3b92171c865c1c3bc0bc6b605a39ee4f6dd3[m | ||
Author: Karla Pavlova Aviles <59234567+kaviles22@users.noreply.github.com> | ||
Date: Thu Jan 2 12:30:30 2025 -0500 | ||
|
||
Initial commit |
Oops, something went wrong.