import os
import time
import logging
from concurrent.futures import ThreadPoolExecutor
from typing import List
from fastapi import HTTPException
from langchain_aws import BedrockLLM
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from src.chatbot.prompts import PROMPT
from src.chatbot.config import MAX_THREADS, CHUNK_SIZE, CHUNK_OVERLAP

logger = logging.getLogger(__name__)

# PDF Document Processor
class PDFDocumentProcessor:
    def __init__(self, data_directory: str):
        self.data_directory = data_directory

    def load_and_chunk_documents(self) -> List[Document]:
        start_time = time.time()
        try:
            loader = PyPDFDirectoryLoader(self.data_directory)
            logger.info(f"Loading PDFs from {self.data_directory}...")
            documents = loader.load()

            text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

            # Parallel chunking: partition the documents into batches so each
            # worker thread splits its own share of the pages.
            batch_size = max(1, len(documents) // MAX_THREADS)
            batches = [documents[i:i + batch_size] for i in range(0, len(documents), batch_size)]
            with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
                chunked_documents = list(executor.map(text_splitter.split_documents, batches))

            logger.info(f"Document loading and chunking completed in {time.time() - start_time:.2f} seconds.")
            return [chunk for sublist in chunked_documents for chunk in sublist]
        except FileNotFoundError:
            logger.error(f"Data directory '{self.data_directory}' not found.")
            raise HTTPException(status_code=404, detail="Data directory not found")
        except Exception as e:
            logger.error(f"Error loading and chunking documents: {e}", exc_info=True)
            raise HTTPException(status_code=500, detail="Error processing documents")

# FAISS Manager
class FAISSManager:
    def __init__(self, index_path: str, embeddings):
        self.index_path = index_path
        self.embeddings = embeddings

    def create_and_save_vector_store(self, chunked_documents: List[Document]):
        try:
            # Ensure the directory for the index exists (index_path may be a bare name)
            index_dir = os.path.dirname(self.index_path)
            if index_dir and not os.path.exists(index_dir):
                os.makedirs(index_dir, exist_ok=True)
                logger.info(f"Created directory for FAISS index: {index_dir}")

            vectorstore_faiss = FAISS.from_documents(chunked_documents, self.embeddings)
            vectorstore_faiss.save_local(self.index_path)
            logger.info(f"FAISS index created and saved to {self.index_path}.")
        except Exception as e:
            logger.error(f"Error creating and saving FAISS vector store: {e}", exc_info=True)
            raise HTTPException(status_code=500, detail="Error creating FAISS vector store")

    def load_vector_store(self):
        # Check for the index outside the try block so the 404 below is not
        # re-wrapped as a 500 by the generic exception handler.
        if not os.path.exists(self.index_path):
            logger.error(f"FAISS index '{self.index_path}' not found. Create the index before loading it.")
            raise HTTPException(status_code=404, detail="FAISS index not found")

        try:
            logger.info(f"Loading FAISS index from {self.index_path}...")
            return FAISS.load_local(self.index_path, self.embeddings, allow_dangerous_deserialization=True)
        except FileNotFoundError:
            logger.error(f"FAISS index file '{self.index_path}' not found.")
            raise HTTPException(status_code=404, detail="FAISS index not found")
        except Exception as e:
            logger.error(f"Error loading FAISS vector store: {e}", exc_info=True)
            raise HTTPException(status_code=500, detail="Error loading FAISS vector store")

# LLM Service
class LLMService:
    def __init__(self, model_id: str, client):
        self.model_id = model_id
        self.client = client

    def initialize_llm(self):
        try:
            logger.info(f"Initializing LLM with model ID: {self.model_id}")
            return BedrockLLM(model_id=self.model_id, client=self.client)
        except Exception as e:
            logger.error(f"Error initializing LLM: {e}", exc_info=True)
            raise HTTPException(status_code=500, detail="Error initializing LLM")

    def generate_response(self, llm, vectorstore_faiss, query: str):
        try:
            start_time = time.time()
            logger.info(f"Generating response for query: '{query}'")
            qa = RetrievalQA.from_chain_type(
                llm=llm,
                chain_type="stuff",
                retriever=vectorstore_faiss.as_retriever(search_type="similarity", search_kwargs={"k": 3}),
                return_source_documents=True,
                chain_type_kwargs={"prompt": PROMPT}
            )
            result = qa.invoke({"query": query})
            logger.info(f"Response generated in {time.time() - start_time:.2f} seconds.")
            return result['result']
        except Exception as e:
            logger.error(f"Error generating LLM response: {e}", exc_info=True)
            raise HTTPException(status_code=500, detail="Error generating response")
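

# Hypothetical usage sketch showing how the three classes above could be wired
# together. It assumes BedrockEmbeddings from langchain_aws and a boto3
# "bedrock-runtime" client; the data directory, index path, model ID, and query
# below are placeholder values, not settings defined elsewhere in this module.
if __name__ == "__main__":
    import boto3
    from langchain_aws import BedrockEmbeddings

    bedrock_client = boto3.client("bedrock-runtime")
    embeddings = BedrockEmbeddings(client=bedrock_client)

    # Load and chunk the PDFs, then build, persist, and reload the FAISS index.
    processor = PDFDocumentProcessor(data_directory="data/pdfs")
    chunks = processor.load_and_chunk_documents()

    faiss_manager = FAISSManager(index_path="indexes/faiss_index", embeddings=embeddings)
    faiss_manager.create_and_save_vector_store(chunks)
    vectorstore = faiss_manager.load_vector_store()

    # Initialize the Bedrock LLM and answer a query against the indexed documents.
    llm_service = LLMService(model_id="anthropic.claude-v2", client=bedrock_client)
    llm = llm_service.initialize_llm()
    print(llm_service.generate_response(llm, vectorstore, "What topics do these documents cover?"))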