Vishnu-add commited on
Commit
677b005
1 Parent(s): 9f2c590

Upload 17 files

Browse files
.gitattributes ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ db/ filter=lfs diff=lfs merge=lfs -text
2
+ # LaMini-T5-738M/ filter=lfs diff=lfs merge=lfs -text
3
+ *.sqlite3 filter=lfs diff=lfs merge=lfs -text
4
+ *.bin filter=lfs diff=lfs merge=lfs -text
5
+ # HF
6
+ *.7z filter=lfs diff=lfs merge=lfs -text
7
+ *.arrow filter=lfs diff=lfs merge=lfs -text
8
+ *.bin filter=lfs diff=lfs merge=lfs -text
9
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
10
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
11
+ *.ftz filter=lfs diff=lfs merge=lfs -text
12
+ *.gz filter=lfs diff=lfs merge=lfs -text
13
+ *.h5 filter=lfs diff=lfs merge=lfs -text
14
+ *.joblib filter=lfs diff=lfs merge=lfs -text
15
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
16
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
17
+ *.model filter=lfs diff=lfs merge=lfs -text
18
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
19
+ *.npy filter=lfs diff=lfs merge=lfs -text
20
+ *.npz filter=lfs diff=lfs merge=lfs -text
21
+ *.onnx filter=lfs diff=lfs merge=lfs -text
22
+ *.ot filter=lfs diff=lfs merge=lfs -text
23
+ *.parquet filter=lfs diff=lfs merge=lfs -text
24
+ *.pb filter=lfs diff=lfs merge=lfs -text
25
+ *.pickle filter=lfs diff=lfs merge=lfs -text
26
+ *.pkl filter=lfs diff=lfs merge=lfs -text
27
+ *.pt filter=lfs diff=lfs merge=lfs -text
28
+ *.pth filter=lfs diff=lfs merge=lfs -text
29
+ *.rar filter=lfs diff=lfs merge=lfs -text
30
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
31
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
33
+ *.tar filter=lfs diff=lfs merge=lfs -text
34
+ *.tflite filter=lfs diff=lfs merge=lfs -text
35
+ *.tgz filter=lfs diff=lfs merge=lfs -text
36
+ *.wasm filter=lfs diff=lfs merge=lfs -text
37
+ *.xz filter=lfs diff=lfs merge=lfs -text
38
+ *.zip filter=lfs diff=lfs merge=lfs -text
39
+ *.zst filter=lfs diff=lfs merge=lfs -text
40
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
41
+ *.pdf filter=lfs diff=lfs merge=lfs -text
42
+ *.pickel filter=lfs diff=lfs merge=lfs -text
43
+ db/*.pickle filter=lfs diff=lfs merge=lfs -text
44
+ db/c811917d-8276-48ba-b913-6ed6196f4484/index_metadata.pickle filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ __pycache__/
2
+ # Lib/
3
+ search_pdf_env/
4
+ LaMini-T5-738M/
5
+ # db/
Commands.txt ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Youtube video : https://youtu.be/rIV1EseKwU4?si=YOJ2a_9eYVPhxn6X
2
+ Github : https://github.com/AIAnytime/Search-Your-PDF-App/tree/main
3
+ LLM : https://huggingface.co/MBZUAI/LaMini-T5-738M
4
+
5
+
6
+ NOTE: Remove the chroma settings from the code to work with latest versions
7
+
8
+
9
+
10
+ 1) Creating a virtual env
11
+ python -m venv <env_name>
12
+
13
+ 2) Activating virtual environment
14
+ search_pdf_env\Scripts\activate
15
+
16
+ 3) Installing requirements:
17
+ pip install -r requirements.txt
README.md CHANGED
@@ -1,27 +1,10 @@
1
- ---
2
- license: mit
3
- title: Chat With Doc
4
- sdk: streamlit
5
- emoji: 🏃
6
- colorFrom: gray
7
- colorTo: pink
8
- ---
9
  metadata
10
-
11
  title: Chat With Doc
12
-
13
  emoji: 😻
14
-
15
  colorFrom: gray
16
-
17
  colorTo: pink
18
-
19
  sdk: streamlit
20
-
21
  sdk_version: 1.29.0
22
-
23
  app_file: app.py
24
-
25
  pinned: false
26
-
27
  license: mit
 
 
 
 
 
 
 
 
 
1
  metadata
 
2
  title: Chat With Doc
 
3
  emoji: 😻
 
4
  colorFrom: gray
 
5
  colorTo: pink
 
6
  sdk: streamlit
 
7
  sdk_version: 1.29.0
 
8
  app_file: app.py
 
9
  pinned: false
 
10
  license: mit
app.ipynb ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "!pip install -r requirements.txt\n",
10
+ "!pip install pyngrok"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": null,
16
+ "metadata": {},
17
+ "outputs": [],
18
+ "source": [
19
+ "!streamlit run app.py &>\"/content/drive/MyDrive/Colab Notebooks/LangChatbot/MAIN PROJECT - Langchain - STREAMLIT/KEFY_BOT_Langchain_streamlit/logs_streamlit.txt\" &\n"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": null,
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "!ngrok config add-authtoken YOUR_NGROK_AUTHTOKEN"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": null,
34
+ "metadata": {},
35
+ "outputs": [],
36
+ "source": [
37
+ "from pyngrok import ngrok\n",
38
+ "ngrok_tunnel = ngrok.connect(8502)\n",
39
+ "print('Public URL:', ngrok_tunnel.public_url)"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": null,
45
+ "metadata": {},
46
+ "outputs": [],
47
+ "source": [
48
+ "ngrok.kill()"
49
+ ]
50
+ }
51
+ ],
52
+ "metadata": {
53
+ "language_info": {
54
+ "name": "python"
55
+ }
56
+ },
57
+ "nbformat": 4,
58
+ "nbformat_minor": 2
59
+ }
app.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
3
+ from transformers import pipeline
4
+ import torch
5
+ import base64
6
+ import textwrap
7
+ from langchain.embeddings import SentenceTransformerEmbeddings
8
+ from langchain.vectorstores import Chroma
9
+ from langchain.llms.huggingface_pipeline import HuggingFacePipeline
10
+ from langchain.chains import RetrievalQA
11
+
12
@st.cache_resource
def get_model():
    """Load the LaMini-T5 seq2seq model and its tokenizer.

    The result is cached by ``st.cache_resource``. The model is placed on
    the first CUDA device when one is available and on the CPU otherwise,
    so the app can also start on hosts without a GPU.

    Returns:
        tuple: ``(base_model, tokenizer)`` loaded from the
        ``MBZUAI/LaMini-T5-738M`` checkpoint.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    checkpoint = "MBZUAI/LaMini-T5-738M"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    base_model = AutoModelForSeq2SeqLM.from_pretrained(
        checkpoint,
        device_map=device,
        torch_dtype=torch.float32,
    )
    return base_model, tokenizer
27
+
28
@st.cache_resource
def llm_pipeline():
    """Build the text2text generation pipeline wrapped for LangChain.

    Returns:
        HuggingFacePipeline: LangChain LLM wrapper around a
        ``text2text-generation`` pipeline that uses the model and tokenizer
        returned by ``get_model()``.
    """
    base_model, tokenizer = get_model()
    # Deliberately no ``device=`` argument: no ``device`` name exists in this
    # scope, and get_model() has already placed the model via ``device_map``.
    pipe = pipeline(
        "text2text-generation",
        model=base_model,
        tokenizer=tokenizer,
        max_length=256,
        do_sample=True,
        temperature=0.3,
        top_p=0.95,
    )
    return HuggingFacePipeline(pipeline=pipe)
44
+
45
@st.cache_resource
def qa_llm():
    """Assemble the retrieval QA chain over the Chroma index stored in ``db``.

    Returns:
        The chain built by ``RetrievalQA.from_chain_type`` with the "stuff"
        chain type; it is configured to return source documents.
    """
    language_model = llm_pipeline()
    sentence_embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    vector_store = Chroma(
        persist_directory="db",
        embedding_function=sentence_embeddings,
    )
    return RetrievalQA.from_chain_type(
        llm=language_model,
        chain_type="stuff",
        retriever=vector_store.as_retriever(),
        return_source_documents=True,
    )
58
+
59
+
60
def process_answer(instruction):
    """Run the QA chain on *instruction*.

    Args:
        instruction: The query, passed unchanged to the chain returned by
            ``qa_llm()``.

    Returns:
        tuple: ``(answer, generated_text)`` where ``generated_text`` is the
        chain's full output and ``answer`` is its ``'result'`` entry.
    """
    generated_text = qa_llm()(instruction)
    return generated_text["result"], generated_text
67
+
68
def main():
    """Render the single-question "Search your pdf" page."""
    st.title("Search your pdf📚")
    with st.expander("About the App"):
        st.markdown(
            """This is a Generative AI powered Question and Answering app that responds to questions about your PDF file.
            """
        )

    question = st.text_area("Enter Your Question")
    if st.button("Search"):
        # Skip retrieval and generation for a blank question.
        if not question.strip():
            st.warning("Please enter a question first.")
            return
        st.info("Your question: " + question)
        st.info("Your Answer")
        answer, metadata = process_answer(question)
        st.write(answer)
        st.write(metadata)


if __name__ == "__main__":
    main()
chatbot.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
3
+ from transformers import pipeline
4
+ import torch
5
+ import base64
6
+ import textwrap
7
+ from langchain.embeddings import SentenceTransformerEmbeddings
8
+ from langchain.vectorstores import Chroma
9
+ from langchain.llms.huggingface_pipeline import HuggingFacePipeline
10
+ from langchain.chains import RetrievalQA
11
+ from streamlit_chat import message
12
+
13
# Prefer the first CUDA device, but fall back to the CPU so this module can
# still be imported on hosts without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# NOTE(review): relative name, presumably a local copy of the model directory
# next to this script; confirm it exists wherever the app is deployed.
checkpoint = "LaMini-T5-738M"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    checkpoint,
    device_map=device,
    torch_dtype=torch.float32,
)
25
+
26
@st.cache_resource
def llm_pipeline():
    """Wrap the module-level model and tokenizer in a LangChain LLM.

    Returns:
        HuggingFacePipeline: wrapper around a ``text2text-generation``
        pipeline built from ``base_model`` and ``tokenizer``.
    """
    generation_settings = {
        "max_length": 256,
        "do_sample": True,
        "temperature": 0.3,
        "top_p": 0.95,
    }
    generator = pipeline(
        "text2text-generation",
        model=base_model,
        tokenizer=tokenizer,
        **generation_settings,
    )
    return HuggingFacePipeline(pipeline=generator)
40
+
41
@st.cache_resource
def qa_llm():
    """Create the retrieval QA chain backed by the Chroma store in ``db``.

    Returns:
        The chain produced by ``RetrievalQA.from_chain_type`` ("stuff" chain
        type), set up to return source documents with each answer.
    """
    local_llm = llm_pipeline()
    embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    store = Chroma(persist_directory="db", embedding_function=embedder)
    chain_options = {
        "llm": local_llm,
        "chain_type": "stuff",
        "retriever": store.as_retriever(),
        "return_source_documents": True,
    }
    return RetrievalQA.from_chain_type(**chain_options)
54
+
55
+
56
def process_answer(instruction):
    """Run the QA chain on *instruction*.

    Args:
        instruction: The chain input, passed unchanged to the chain returned
            by ``qa_llm()``.

    Returns:
        tuple: ``(answer, generated_text)`` where ``generated_text`` is the
        chain's full output and ``answer`` is its ``'result'`` entry.
    """
    generated_text = qa_llm()(instruction)
    return generated_text["result"], generated_text
63
+
64
# Display conversation history using Streamlit messages
def display_conversation(history):
    """Render every stored exchange: the user's message, then the reply.

    Args:
        history: Mapping with parallel ``"past"`` (user inputs) and
            ``"generated"`` (replies) sequences.
    """
    for index, reply in enumerate(history["generated"]):
        user_text = history["past"][index]
        message(user_text, is_user=True, key=f"{index}_user")
        message(reply, key=f"{index}")
69
+
70
+
71
def main():
    """Render the chat page and keep the conversation in session state."""
    st.title("Chat with your pdf📚")
    with st.expander("About the App"):
        st.markdown(
            """
            This is a Generative AI powered Question and Answering app that responds to questions about your PDF file.
            """
        )

    user_input = st.text_input("", key="input")

    # Initialize session state for generated responses and past messages
    if "generated" not in st.session_state:
        st.session_state["generated"] = ["I am ready to help you"]
    if "past" not in st.session_state:
        st.session_state["past"] = ["Hey There!"]

    # Search the database for a response based on user input and update session state
    if user_input:
        # process_answer returns (answer, full chain output); only the answer
        # text belongs in the chat history.
        answer, _ = process_answer({"query": user_input})
        st.session_state["past"].append(user_input)
        st.session_state["generated"].append(answer)

    # Display Conversation history using Streamlit messages
    if st.session_state["generated"]:
        display_conversation(st.session_state)


if __name__ == "__main__":
    main()
constants.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from chromadb.config import Settings
3
+
4
# Define Chroma Settings
# The legacy ``chroma_db_impl='duckdb+parquet'`` option is intentionally not
# set: current chromadb releases treat it as deprecated configuration, and the
# store under ``db`` is already in the newer sqlite-based layout.
CHROMA_SETTINGS = Settings(
    persist_directory="db",
    anonymized_telemetry=False,
)
db/c811917d-8276-48ba-b913-6ed6196f4484/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0656652b4f3db81247ca6f4a0365416da3b66a0ed0cd46e9392400ee92da06ef
3
+ size 62012000
db/c811917d-8276-48ba-b913-6ed6196f4484/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44c6e025ebb371f800e844ce62d9b7dde9b123633b5d9e3bf6199de9a6580582
3
+ size 100
db/c811917d-8276-48ba-b913-6ed6196f4484/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05b13caae7bf03a47b0bc51c04f39eb07ffdc234fe6b7f369b872a2447117da8
3
+ size 2144478
db/c811917d-8276-48ba-b913-6ed6196f4484/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4fd7fddbb7246719bc06423736fe0cebe9b417bdb555ae72f6061248bc1e995
3
+ size 148000
db/c811917d-8276-48ba-b913-6ed6196f4484/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9fbff72c999b684e5ef2d0dfbeb81e5179ca48fa5c62b8ccadf3ef53f2561744
3
+ size 317184
db/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c5ae7212513205065174fc77e7fd813e803de0635f4fb32947eeeb2fbb067cf
3
+ size 264290304
docs/Alfred V. Aho, Monica S. Lam, Ravi Sethi, Jeffrey D. Ullman-Compilers - Principles, Techniques, and Tools-Pearson_Addison Wesley (2006).pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92646e7788a17653fbcd9aaf16724ae62e67b4990f4289ee39ca55e5fb9ab62a
3
+ size 6060190
ingest.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain.document_loaders import PyPDFLoader, DirectoryLoader, PDFMinerLoader
2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
3
+ from langchain.embeddings import SentenceTransformerEmbeddings
4
+ from langchain.vectorstores import Chroma
5
+ import os
6
+ from constants import CHROMA_SETTINGS
7
+
8
persist_directory = "db"


def main():
    """Index every PDF found under ``docs`` into the Chroma store.

    Walks ``docs`` recursively, loads each ``.pdf`` with ``PDFMinerLoader``,
    splits the text into chunks, embeds them with ``all-MiniLM-L6-v2`` and
    persists the resulting vector store to ``persist_directory``. Prints a
    message and returns without touching the store when no PDF is found.
    """
    documents = []
    for root, _dirs, files in os.walk("docs"):
        for file in files:
            if file.endswith(".pdf"):
                print(file)
                loader = PDFMinerLoader(os.path.join(root, file))
                # Accumulate pages from every PDF, not only the last one seen.
                documents.extend(loader.load())

    if not documents:
        print("No PDF documents found under 'docs'; nothing to ingest.")
        return

    # The overlap must stay well below the chunk size: an overlap equal to
    # the chunk size yields near-duplicate chunks and bloats the index.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_documents(documents)
    # create embeddings
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    # create vector store
    db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory)
    db.persist()
    db = None


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ streamlit
3
+ transformers
4
+ requests
5
+ torch
6
+ einops
7
+ accelerate
8
+ bitsandbytes
9
+ pdfminer.six
10
+ bs4
11
+ sentence-transformers
12
+ chromadb
13
+ torchvision
14
+ torchaudio
15
+ sentencepiece
16
+ requests
17
+ uvicorn
18
+ streamlit-chat