# app.py

import logging
from logging.handlers import RotatingFileHandler
import sys
from transformers import pipeline
import os
from llama_index.core.postprocessor import NERPIINodePostprocessor
from llama_index.core.schema import TextNode
import getpass
import streamlit as st
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import google.generativeai as genai
from langchain.vectorstores.faiss import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
# Send root-logger output (INFO and above) to stdout. basicConfig() installs a
# single stdout handler when the root logger has none, so it is called exactly
# once and no additional StreamHandler is attached: a second stdout handler
# would print every record twice, and it would be added again each time this
# module's top level is executed in the same process.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# Path of the rotating application log file handed to setup_logger().
log_file = 'CHAT@2.log'

def setup_logger(log_file):
    """Return this module's logger, configured to write to a rotating log file.

    The logger (named after ``__name__``) is set to DEBUG and gets a
    ``RotatingFileHandler`` for *log_file* that rolls over at 1 MiB and keeps
    up to 5 backups. Records are formatted as
    ``<asctime> - <levelname> - <message>``.

    The function is idempotent: ``logging.getLogger`` returns the same logger
    object for the same name for the lifetime of the process, so the handler
    is only attached if an equivalent one is not already present. Without this
    check every additional call would add another handler and each record
    would be written to the file multiple times.

    Args:
        log_file: Path of the log file to write to.

    Returns:
        The configured ``logging.Logger`` instance.
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    # FileHandler stores the absolute path in ``baseFilename``; compare on that.
    target_path = os.path.abspath(log_file)
    already_attached = any(
        isinstance(handler, RotatingFileHandler) and handler.baseFilename == target_path
        for handler in logger.handlers
    )
    if not already_attached:
        # Max file size 1 MiB, keep up to 5 backup files.
        rotating_handler = RotatingFileHandler(log_file, maxBytes=1024 * 1024, backupCount=5)
        rotating_handler.setLevel(logging.DEBUG)
        rotating_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(rotating_handler)
    return logger
# Module-wide application logger; setup_logger() attaches a rotating file
# handler writing to `log_file`.
logger = setup_logger(log_file)
# NOTE(review): in the visible part of this file OpenAIEmbeddings is only
# referenced from commented-out code.
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
# Load environment variables (presumably from a local .env file) so that
# GOOGLE_API_KEY is available below.
load_dotenv()
# NOTE(review): the return value of this call is discarded, so the line has no
# effect; the key is actually read on the next line.
os.getenv("GOOGLE_API_KEY")
# Configure the google.generativeai client with the API key from the
# environment (None is passed if the variable is not set).
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
# Token-classification (NER) pipeline, built once when this module's top level
# runs. aggregation_strategy='simple' is what makes each result carry the
# 'entity_group' / 'word' / 'score' keys read by get_entities_from_text().
ner_pipeline = pipeline(
    'token-classification', 
    model=r'djagatiya/ner-roberta-base-ontonotesv5-englishv4',
    aggregation_strategy='simple'
)
def get_pdf_text(pdf_docs):
    """Return the concatenated text of every page of every PDF in *pdf_docs*.

    Args:
        pdf_docs: Iterable of PDF sources accepted by ``PdfReader`` (``main``
            passes the files returned by the Streamlit uploader). ``None`` or
            an empty iterable yields an empty string.

    Returns:
        A single string with the page texts joined in document/page order,
        with no separator between pages.
    """
    # Collect page texts and join once instead of growing a string with `+=`.
    # `or ""` keeps the join safe should a page yield None instead of a string.
    return "".join(
        page.extract_text() or ""
        for pdf in (pdf_docs or ())
        for page in PdfReader(pdf).pages
    )

def get_text_chunks(text):
    """Split *text* into overlapping chunks for embedding.

    Uses ``RecursiveCharacterTextSplitter`` with ``chunk_size=10000`` and
    ``chunk_overlap=1000``.

    Args:
        text: The full document text to split.

    Returns:
        The chunks produced by ``split_text``.

    Raises:
        Exception: Any error raised while splitting is logged (with traceback)
            and re-raised. Previously the error was swallowed and the function
            then failed with an unrelated ``UnboundLocalError`` on ``chunks``.
    """
    try:
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
        return text_splitter.split_text(text)
    except Exception as e:
        # One call is enough: the module logger writes to the log file and
        # propagates to the root logger's stdout handler.
        logger.exception(f"Error at get_text_chunks function : {e}")
        raise

def get_vector_store(text_chunks):
    """Embed *text_chunks*, build a FAISS index and save it to ``faiss_index``.

    Args:
        text_chunks: The text chunks to embed and index.

    Returns:
        The FAISS vector store that was built and saved.

    Raises:
        Exception: Any error raised while embedding, indexing or saving is
            logged (with traceback) and re-raised. Previously the handler
            itself crashed with ``UnboundLocalError`` because it referenced
            ``vector_store`` before it was assigned.
    """
    try:
        embeddings = GoogleGenerativeAIEmbeddings(model = "models/embedding-001")
        vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
        # Saved locally so that user_input() can reload it on a later run.
        vector_store.save_local("faiss_index")
    except Exception as e:
        # One call is enough: the module logger writes to the log file and
        # propagates to the root logger's stdout handler.
        logger.exception(f"Error at get_vector_store function : {e}")
        raise
    # Success path only: vector_store is guaranteed to be bound here.
    logger.debug(f"Vector store from get_vector_store function : {vector_store} ")
    return vector_store

def get_conversational_chain():
    """Build the question-answering chain used to answer from retrieved context.

    The chain is created with ``load_qa_chain(..., chain_type="stuff")`` around
    a ``gemini-pro`` chat model (temperature 0.3) and a prompt that takes the
    ``context`` and ``question`` variables and tells the model to reply
    "Answer is not available in the context" when the answer is not present.

    Returns:
        The chain object returned by ``load_qa_chain``.

    Raises:
        Exception: Any error raised while building the model, prompt or chain
            is logged (with traceback) and re-raised. Previously the error was
            swallowed and the function then failed with an unrelated
            ``UnboundLocalError`` on ``chain``.
    """
    prompt_template = """
    Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in
    provided context just say, "Answer is not available in the context", don't provide the wrong answer\n\n
    Context:\n {context}?\n
    Question: \n{question}\n
    Answer:
    """
    try:
        model = ChatGoogleGenerativeAI(model="gemini-pro",temperature=0.3)
        prompt = PromptTemplate(template = prompt_template, input_variables = ["context", "question"])
        return load_qa_chain(model, chain_type="stuff", prompt=prompt)
    except Exception as e:
        # One call is enough: the module logger writes to the log file and
        # propagates to the root logger's stdout handler.
        logger.exception(f"Error at get_conversational_chain function : {e}")
        raise

def user_input(user_question):
    """Answer *user_question* from the saved FAISS index and show the reply.

    Loads the index from ``faiss_index``, retrieves the documents most similar
    to the question, runs them through the QA chain and writes the chain's
    ``output_text`` to the Streamlit page.

    Args:
        user_question: The question typed by the user.

    Returns:
        None. If no index has been saved yet, a warning is shown instead of
        attempting to load it.
    """
    # The index only exists after get_vector_store() has saved it; asking a
    # question before that would otherwise fail inside FAISS.load_local.
    if not os.path.exists("faiss_index"):
        st.warning("Please upload and process your PDF files before asking a question.")
        return

    embeddings = GoogleGenerativeAIEmbeddings(model = "models/embedding-001")
    # SECURITY NOTE: allow_dangerous_deserialization=True permits loading
    # pickled data from disk. This is only acceptable while "faiss_index" is
    # written exclusively by this application; never point it at an index
    # obtained from an untrusted source.
    new_db = FAISS.load_local("faiss_index", embeddings,allow_dangerous_deserialization=True)
    docs = new_db.similarity_search(user_question)
    chain = get_conversational_chain()
    response = chain(
        {"input_documents": docs, "question": user_question},
        return_only_outputs=True,
    )
    # Console echo of the raw chain response (kept from the original).
    print(response)
    st.write("Reply: ", response["output_text"])

def get_entities_from_text(raw_text, ner_pipeline):
    """Run *ner_pipeline* over *raw_text* and return one row per entity found.

    Side effects: writes the total number of recognised entities to the
    Streamlit page and logs the raw pipeline response.

    Args:
        raw_text: The text to analyse.
        ner_pipeline: Callable invoked as ``ner_pipeline(inputs=raw_text)``;
            each result must provide the ``entity_group``, ``word`` and
            ``score`` keys.

    Returns:
        A list of ``[word, colored_box_html, score]`` rows, where
        ``colored_box_html`` is a ``<div>`` snippet whose background colour is
        chosen by entity group. Returns the string ``"No"`` when no rows were
        produced, including when the pipeline fails (the failure is logged,
        not raised). If a failure happens part-way through, the rows built so
        far are returned.
    """
    # Entity group -> background colour of its label box.
    # NOTE(review): "darkyellow", "lightbrown", "darkpink" and "apricot" are
    # not standard CSS colour names, so a browser would ignore them; left
    # unchanged here because the strings are part of the returned data.
    entity_group_colors = {
        "CARDINAL": "red",
        "DATE": "yellow",
        "EVENT": "lightblue",
        "FAC": "red",
        "GPE": "lightpink",
        "LANGUAGE": "darkyellow",
        "LAW": "black",
        "LOC": "lightbrown",
        "MONEY": "lightgreen",
        "NORP": "grey",
        "ORDINAL": "magenta",
        "ORG": "orange",
        "PERCENT": "darkpink",
        "PERSON": "mintcream",  # Using MintCream for better visibility
        "PRODUCT": "cyan",
        "QUANTITY": "lavender",
        "TIME": "apricot",
        "WORK_OF_ART": "teal",
        "micro avg": "gold",
        "macro avg": "maroon",
        "weighted avg": "beige"
    }
    # Bound before the try block so the final return is always valid, even
    # when the pipeline call itself raises.
    entities = []
    try:
        response_NER = ner_pipeline(inputs=raw_text)
        total_entities = len(response_NER)
        st.write(f"Total entities recognized in the document: {total_entities}")
        logger.info(f"Response : {response_NER}")
        for entity in response_NER:
            entity_group = entity['entity_group']
            # Unknown groups fall back to a neutral colour.
            color = entity_group_colors.get(entity_group, "lightgray")
            colored_box_html = f'<div style="background-color:{color}; padding:2px; border-radius:3px;">{entity_group}</div>'
            entities.append([entity['word'], colored_box_html, entity['score']])
    except Exception as e:
        # Best effort: report the failure at ERROR level (with traceback) and
        # fall through to the normal return. One call is enough because the
        # module logger propagates to the root logger's stdout handler.
        logger.exception(f"Error at get_entities_from_text function : {e}")
    return entities if entities else "No"

    
def main():
    """Render the Streamlit page: question box, PDF upload sidebar and NER button.

    * A non-empty question is answered through ``user_input``.
    * "Submit & Process" extracts the uploaded PDFs' text, chunks it and
      builds/saves the vector store.
    * "Recognise Entity" runs named-entity recognition over the uploaded PDFs
      and shows the resulting rows in a table.

    Both buttons show a warning instead of running when no file has been
    uploaded, because processing an empty upload would otherwise fail further
    down the pipeline.
    """
    st.set_page_config("Chat PDF")
    st.header("Chat with PDF using Gemini💁")

    user_question = st.text_input("Ask a Question from the PDF Files")
    if user_question:
        user_input(user_question)

    with st.sidebar:
        st.title("Menu:")
        pdf_docs = st.file_uploader("Upload your PDF Files and Click on the Submit & Process Button", accept_multiple_files=True)
        if st.button("Submit & Process"):
            if not pdf_docs:
                st.warning("Please upload at least one PDF file first.")
            else:
                with st.spinner("Processing..."):
                    raw_text = get_pdf_text(pdf_docs)
                    text_chunks = get_text_chunks(raw_text)
                    get_vector_store(text_chunks)
                    st.success("Done")

    # Entity recognition runs only when its button was clicked on this run.
    if st.button("Recognise Entity"):
        if not pdf_docs:
            st.warning("Please upload at least one PDF file first.")
        else:
            raw_text = get_pdf_text(pdf_docs)
            entities_table = get_entities_from_text(raw_text, ner_pipeline)
            # get_entities_from_text() returns the string "No" when it has no
            # rows; that sentinel is not tabular data, so report it instead of
            # handing it to the table widget.
            if isinstance(entities_table, str):
                st.info("No entities were recognised in the document.")
            else:
                st.data_editor(entities_table, use_container_width=True, num_rows="fixed")

# Entry point: build the page only when this file is executed as the main
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()