arasu088 committed on
Commit
ff71e02
1 Parent(s): ebddcd4

Upload helper.py

Browse files
Files changed (1) hide show
  1. helper.py +8 -9
helper.py CHANGED
@@ -42,8 +42,8 @@ llm = HuggingFacePipeline(pipeline=pipe)
42
  # # Initialize instructor embeddings using the Hugging Face model
43
  # instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="C:/Users/arasu/Workspace/Projects/GenAI/embeddings/hkunlp_instructor-large")
44
  instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large")
 
45
  vector_db = ""
46
-
47
  def create_vector_db():
48
  # Load data from pdf
49
  raw_text = ""
@@ -53,14 +53,13 @@ def create_vector_db():
53
  chunk_overlap = 100,
54
  length_function = len,
55
  )
56
- for root, dirs, files in os.walk("docs"):
57
- for file in files:
58
- if file.endswith(".pdf"):
59
- pdf = PdfReader("./docs/"+file)
60
- for i, page in enumerate(pdf.pages):
61
- content = page.extract_text()
62
- if content:
63
- raw_text += content
64
  texts = text_splitter.split_text(raw_text)
65
 
66
  # Create a vector database from 'text'
 
42
  # # Initialize instructor embeddings using the Hugging Face model
43
  # instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="C:/Users/arasu/Workspace/Projects/GenAI/embeddings/hkunlp_instructor-large")
44
  instructor_embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large")
45
+ # db_path = "vector_db"
46
  vector_db = ""
 
47
  def create_vector_db():
48
  # Load data from pdf
49
  raw_text = ""
 
53
  chunk_overlap = 100,
54
  length_function = len,
55
  )
56
+ from PyPDF2 import PdfReader
57
+ pdf = PdfReader("employment-agreement2018.pdf")
58
+ raw_text = ""
59
+ for i, page in enumerate(pdf.pages):
60
+ content = page.extract_text()
61
+ if content:
62
+ raw_text += content
 
63
  texts = text_splitter.split_text(raw_text)
64
 
65
  # Create a vector database from 'text'