Skip to content

Commit

Permalink
First commit
Browse files Browse the repository at this point in the history
  • Loading branch information
kpaviles committed Jan 3, 2025
1 parent e6a69b1 commit 71a70b3
Show file tree
Hide file tree
Showing 16 changed files with 5,364,655 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/myenv
*.env
/csv_files
57 changes: 57 additions & 0 deletions GroomingDetector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import os
from langchain_openai import OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers.string import StrOutputParser
import pandas as pd

# Prompt used by the detection chain. It has two placeholders:
#   {context}  - filled with the retrieved example chats joined into one string
#   {question} - filled with the conversation text passed to the chain
# NOTE(review): the text below is sent to the model verbatim, so its wording
# (e.g. "This are some ...") is part of runtime behaviour and is left untouched.
PROMPT_TEMPLATE = """
You are an expert on detecting grooming on chat conversations.
This are some grooming chat examples, keep in mind that conversation
messages are separated by a | character.
{context}
---
Taking into account the previous examples, do you identify any grooming behavior
in the next chat? Answer if the conversation is grooming or not, and
give the literal text that makes you think so.
{question}
"""
def join_docs(chunks):
    """Return the ``page_content`` of every chunk joined by a blank line.

    Args:
        chunks: Iterable of objects exposing a ``page_content`` string.

    Returns:
        A single string with the contents separated by two newlines; the
        empty string when ``chunks`` is empty.
    """
    separator = "\n\n"
    contents = [doc.page_content for doc in chunks]
    return separator.join(contents)

class GroomingDetector:
    """Detect grooming in a chat by comparing it with stored example chats.

    The detector connects to an existing Qdrant collection, retrieves the
    stored chunks most similar to the input conversation, inserts them into
    ``PROMPT_TEMPLATE`` as context and asks an OpenAI chat model for a verdict.

    Attributes set by ``__init__``:
        qdrant_url, qdrant_key: Connection settings for Qdrant.
        collection_name: Name of the Qdrant collection to query.
        embedding_model, llm_model: OpenAI model names in use.
        embedding_function: ``OpenAIEmbeddings`` instance used for retrieval.
        qdrant, retriever: The vector store and the retriever derived from it.
        prompt, llm, chain: The prompt template, chat model and runnable chain.
    """

    def __init__(
        self,
        qdrant_url,
        qdrant_key,
        temperature=0,
        *,
        collection_name="groom_chats",
        embedding_model="text-embedding-3-large",
        llm_model="gpt-3.5-turbo",
    ):
        """Connect to the vector store and build the detection chain.

        Args:
            qdrant_url: URL of the Qdrant instance.
            qdrant_key: API key for the Qdrant instance.
            temperature: Sampling temperature passed to the chat model.
            collection_name: Existing Qdrant collection holding the example
                chats. Keyword-only.
            embedding_model: OpenAI embedding model name. Keyword-only.
                NOTE(review): presumably must be the same model that was used
                to populate the collection - confirm against the loader.
            llm_model: OpenAI chat model name. Keyword-only.
        """
        self.qdrant_url = qdrant_url
        self.qdrant_key = qdrant_key
        self.collection_name = collection_name
        self.embedding_model = embedding_model
        self.llm_model = llm_model
        self.embedding_function = OpenAIEmbeddings(model=embedding_model)
        # The retriever must exist before the chain is assembled.
        self.qdrant, self.retriever = self.load_vectorstore()
        self.prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
        self.llm = ChatOpenAI(model=llm_model, temperature=temperature)
        self.chain = self.load_chain()

    def load_vectorstore(self):
        """Open the existing Qdrant collection and derive a retriever from it.

        Returns:
            A ``(vector_store, retriever)`` tuple.
        """
        qdrant = QdrantVectorStore.from_existing_collection(
            embedding=self.embedding_function,
            collection_name=self.collection_name,
            url=self.qdrant_url,
            api_key=self.qdrant_key,
        )
        retriever = qdrant.as_retriever()
        return qdrant, retriever

    def load_chain(self):
        """Assemble the retrieval -> prompt -> chat model -> string chain.

        The input text is used twice: it is sent to the retriever (whose
        results are joined by ``join_docs`` into ``context``) and passed
        through unchanged as ``question``.

        Returns:
            The composed runnable chain.
        """
        chain = (
            {
                "context": self.retriever | join_docs,
                "question": RunnablePassthrough(),
            }
            | self.prompt
            | self.llm
            | StrOutputParser()
        )
        return chain

    def invoke(self, text):
        """Run the detection chain on ``text`` and return the model's answer.

        Args:
            text: The conversation to analyse.

        Returns:
            The chain output (a string, as produced by ``StrOutputParser``).
        """
        return self.chain.invoke(text)
59 changes: 59 additions & 0 deletions autoload_qdrant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from dotenv import load_dotenv
import os
from preprocess.clear_data import xml2csv
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_openai import OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load variables from a .env file (if any) into the process environment;
# QDRANT_URL and QDRANT_API_KEY are read from the environment further down.
load_dotenv()

# Input files: conversation XML and predator-id list for the training and test
# splits. The training data becomes the document store.
# NOTE: these are relative paths, so they resolve against the current working directory.
CORPUS_DATA_PATH="../pan12-sexual-predator/training/conversations.xml"
PREDATORS_DATA_PATH="../pan12-sexual-predator/training/predator-ids.txt"
CORPUS_TEST_DATA_PATH="../pan12-sexual-predator/test/conversations.xml"
PREDATORS_TEST_DATA_PATH="../pan12-sexual-predator/test/predator-ids.txt"

# Make sure the output directory for the generated CSV files exists.
os.makedirs("csv_files", exist_ok=True)
# Convert the training XML to CSV, keeping only the abusive cases (only_abusive=True);
# this file is what gets stored in Qdrant.
xml2csv(nameXML=CORPUS_DATA_PATH,nameCSV="csv_files/abusive_text.csv",predatorsTXT=PREDATORS_DATA_PATH, only_abusive=True)
# Convert the test XML to CSV, keeping all cases (only_abusive=False), to test the model.
xml2csv(nameXML=CORPUS_TEST_DATA_PATH,nameCSV="csv_files/abusive_text_test.csv",predatorsTXT=PREDATORS_TEST_DATA_PATH, only_abusive=False)

# Load the training CSV with LangChain's CSVLoader: CONVERSATION_TEXT is the document
# content, CONVERSATION_ID is the source and AUTHORS_IDS is kept as metadata.
# NOTE(review): "fieldnames" is given explicitly; if xml2csv writes a header row,
# that row would presumably be loaded as a document too - confirm against xml2csv.
loader = CSVLoader(
file_path="csv_files/abusive_text.csv",
csv_args={
"delimiter": ";",
"fieldnames": ["CONVERSATION_ID", "AUTHORS_IDS", "IS_ABUSIVE", "CONVERSATION_TEXT"],
},
content_columns=["CONVERSATION_TEXT"],
source_column="CONVERSATION_ID",
metadata_columns=["AUTHORS_IDS"],
)

data = loader.load()

# Create a RecursiveCharacterTextSplitter configured with chunk_size=1000 and
# chunk_overlap=150 (characters), so consecutive chunks share some context.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)

# Split the loaded documents into chunks; 'splits' holds the resulting chunk documents.
splits = text_splitter.split_documents(data)

# Qdrant connection settings come from the environment.
# NOTE(review): os.getenv returns None when a variable is unset and nothing checks
# for that here - confirm both are always defined where this script runs.
qdrant_url = os.getenv("QDRANT_URL")
qdrant_key = os.getenv("QDRANT_API_KEY")

# Embedding model used to turn each chunk into a vector.
embedding_function = OpenAIEmbeddings(model="text-embedding-3-large")

# Embed the chunks and store them in the "groom_chats" collection in Qdrant.
qdrant = QdrantVectorStore.from_documents(
splits,
embedding_function,
url=qdrant_url,
prefer_grpc=True,
api_key=qdrant_key,
collection_name="groom_chats",
)
36 changes: 36 additions & 0 deletions et --soft HEAD~2
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
commit a6a7cacb26c6ab60cbec713d36f1293005fd4fea (HEAD -> main)
Author: kpaviles <kpam2435@gmail.com>
Date: Thu Jan 2 23:36:27 2025 -0500

Modified xml docs

commit 818493195494d89456270f39310e12e3ccb9b774
Merge: 57444be e6a69b1
Author: kpaviles <kpam2435@gmail.com>
Date: Thu Jan 2 23:21:46 2025 -0500

Merge branch 'main' of github.com:kaviles22/Grooming-Analysis

commit 57444beaf3c49614b1057ca657860ea601896dbe
Author: kpaviles <kpam2435@gmail.com>
Date: Thu Jan 2 23:21:29 2025 -0500

First commit

commit e6a69b1b21ee84ba8168786fa80edc70aacafcf2 (origin/main, origin/HEAD)
Author: Karla Pavlova Aviles <59234567+kaviles22@users.noreply.github.com>
Date: Thu Jan 2 23:20:37 2025 -0500

Rename .github/autoload_qdrant.yml to .github/workflows/autoload_qdrant.yml

commit a0923149f86675cb97ad83eba03b74923976da75
Author: Karla Pavlova Aviles <59234567+kaviles22@users.noreply.github.com>
Date: Thu Jan 2 23:19:05 2025 -0500

Create autoload_qdrant.yml

commit a3cc3b92171c865c1c3bc0bc6b605a39ee4f6dd3
Author: Karla Pavlova Aviles <59234567+kaviles22@users.noreply.github.com>
Date: Thu Jan 2 12:30:30 2025 -0500

Initial commit
Loading

0 comments on commit 71a70b3

Please sign in to comment.