Files changed (1)
  1. embedding.py +254 -255
embedding.py CHANGED
@@ -1,255 +1,254 @@
- import requests
- import json
- import os
- import concurrent.futures
- import random
- from langchain_google_genai import ChatGoogleGenerativeAI
- from langchain_community.document_loaders import WebBaseLoader
- from langchain_community.document_loaders import PyPDFLoader
- from langchain.text_splitter import RecursiveCharacterTextSplitter
- import google.generativeai as genai
-
-
- gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBmZtXjJgp7yIAo9joNCZGSxK9PbGMcVaA',temperature = 0.1)
- gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyABsaDjPujPCBlz4LLxcXDX_bDA9uEL7Xc',temperature = 0.1)
- gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBCIQgt1uK7-sJH5Afg5vUZ99EWkx5gSU0',temperature = 0.1)
- gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBot9W5Q-BKQ66NAYRUmVeloXWEbXOXTmM',temperature = 0.1)
-
- genai.configure(api_key="AIzaSyBmZtXjJgp7yIAo9joNCZGSxK9PbGMcVaA")
-
-
- def pdf_extractor(link):
-     text = ''
-
-     try:
-         loader = PyPDFLoader(link)
-         pages = loader.load_and_split()
-
-         for page in pages:
-             text+=page.page_content
-     except:
-         pass
-
-     return [text]
-
- def web_extractor(link):
-     text = ''
-
-     try:
-         loader = WebBaseLoader(link)
-         pages = loader.load_and_split()
-
-         for page in pages:
-             text+=page.page_content
-     except:
-         pass
-
-     return [text]
-
-
- def feature_extraction(tag, history , context):
-
-     prompt = f'''
-     You are an intelligent assistant tasked with updating product information. You have two data sources:
-     1. Tag_History: Previously gathered information about the product.
-     2. Tag_Context: New data that might contain additional details.
-
-     Your job is to read the Tag_Context and update the relevant field in the Tag_History with any new details found. The field to be updated is the {tag} FIELD.
-
-     Guidelines:
-     - Only add new details that are relevant to the {tag} FIELD.
-     - Do not add or modify any other fields in the Tag_History.
-     - Ensure your response is in coherent sentences, integrating the new details seamlessly into the existing information.
-
-     Here is the data:
-
-     Tag_Context: {str(context)}
-     Tag_History: {history}
-
-     Respond with the updated Tag_History.
-     '''
-
-     model = random.choice([gemini,gemini1])
-     result = model.invoke(prompt)
-
-     return result.content
-
- def detailed_feature_extraction(find, context):
-
-     prompt = f'''
-     You are an intelligent assistant tasked with finding product information. You have one data source and one output format:
-     1. Context: The gathered information about the product.
-     2. Format: Details which need to be filled based on Context.
-
-     Your job is to read the Context and update the relevant field in Format using Context.
-
-     Guidelines:
-     - Only add details that are relevant to the individual FIELD.
-     - Do not add or modify any other fields in the Format.
-     - If nothing found return None.
-
-     Here is the data:
-
-     The Context is {str(context)}
-     The Format is {str(find)}
-     '''
-
-     model = random.choice([gemini,gemini1,gemini2,gemini3])
-     result = model.invoke(prompt)
-
-     return result.content
-
- def detailed_history(history):
-
-     details = {
-         "Introduction": {
-             "Product Name": None,
-             "Overview of the product": None,
-             "Purpose of the manual": None,
-             "Audience": None,
-             "Additional Details": None
-         },
-         "Specifications": {
-             "Technical specifications": None,
-             "Performance metrics": None,
-             "Additional Details": None
-         },
-         "Product Overview": {
-             "Product features": None,
-             "Key components and parts": None,
-             "Additional Details": None
-         },
-         "Safety Information": {
-             "Safety warnings and precautions": None,
-             "Compliance and certification information": None,
-             "Additional Details": None
-         },
-         "Installation Instructions": {
-             "Unboxing and inventory checklist": None,
-             "Step-by-step installation guide": None,
-             "Required tools and materials": None,
-             "Additional Details": None
-         },
-         "Setup and Configuration": {
-             "Initial setup procedures": None,
-             "Configuration settings": None,
-             "Troubleshooting setup issues": None,
-             "Additional Details": None
-         },
-         "Operation Instructions": {
-             "How to use the product": None,
-             "Detailed instructions for different functionalities": None,
-             "User interface guide": None,
-             "Additional Details": None
-         },
-         "Maintenance and Care": {
-             "Cleaning instructions": None,
-             "Maintenance schedule": None,
-             "Replacement parts and accessories": None,
-             "Additional Details": None
-         },
-         "Troubleshooting": {
-             "Common issues and solutions": None,
-             "Error messages and their meanings": None,
-             "Support Information": None,
-             "Additional Details": None
-         },
-         "Warranty Information": {
-             "Terms and Conditions": None,
-             "Service and repair information": None,
-             "Additional Details": None
-         },
-         "Legal Information": {
-             "Copyright information": None,
-             "Trademarks and patents": None,
-             "Disclaimers": None,
-             "Additional Details": None
-
-         }
-     }
-
-     for key,val in history.items():
-
-         find = details[key]
-
-         details[key] = str(detailed_feature_extraction(find,val))
-
-     return details
-
-
- def get_embeddings(link):
-
-     print(f"\nCreating Embeddings ----- {link}")
-     history = {
-         "Introduction": "",
-         "Specifications": "",
-         "Product Overview": "",
-         "Safety Information": "",
-         "Installation Instructions": "",
-         "Setup and Configuration": "",
-         "Operation Instructions": "",
-         "Maintenance and Care": "",
-         "Troubleshooting": "",
-         "Warranty Information": "",
-         "Legal Information": ""
-     }
-
-     # Extract Text -----------------------------
-     print("Extracting Text")
-     if link[-3:] == '.md' or link[8:11] == 'en.':
-         text = web_extractor(link)
-     else:
-         text = pdf_extractor(link)
-
-     # Create Chunks ----------------------------
-     print("Writing Tag Data")
-     chunks = text_splitter.create_documents(text)
-
-     for chunk in chunks:
-
-         with concurrent.futures.ThreadPoolExecutor() as executor:
-             future_to_key = {
-                 executor.submit(
-                     feature_extraction, f"Product {key}", history[key], chunk.page_content
-                 ): key for key in history
-             }
-             for future in concurrent.futures.as_completed(future_to_key):
-                 key = future_to_key[future]
-                 try:
-                     response = future.result()
-                     history[key] = response
-                 except Exception as e:
-                     print(f"Error processing {key}: {e}")
-
-     # history = detailed_history(history)
-     print("Creating Vectors")
-     print(history)
-     genai_embeddings=[]
-
-     for tag in history:
-         try:
-             result = genai.embed_content(
-                 model="models/embedding-001",
-                 content=history[tag],
-                 task_type="retrieval_document")
-             genai_embeddings.append(result['embedding'])
-         except:
-             genai_embeddings.append([0]*768)
-
-
-     return history,genai_embeddings
-
- global text_splitter
- global data
- global history
-
-
- text_splitter = RecursiveCharacterTextSplitter(
-     chunk_size = 10000,
-     chunk_overlap = 100,
-     separators = ["",''," "]
- )
-
-
- if __name__ == '__main__':
-     pass
 
+ from PyPDF2 import PdfReader
+ import requests
+ import json
+ import os
+ import concurrent.futures
+ import random
+ from langchain_google_genai import ChatGoogleGenerativeAI
+ from langchain_community.document_loaders import WebBaseLoader
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ import google.generativeai as genai
+ from io import BytesIO
+
+
+
+ gemini = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBmZtXjJgp7yIAo9joNCZGSxK9PbGMcVaA',temperature = 0.1)
+ gemini1 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyABsaDjPujPCBlz4LLxcXDX_bDA9uEL7Xc',temperature = 0.1)
+ gemini2 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBCIQgt1uK7-sJH5Afg5vUZ99EWkx5gSU0',temperature = 0.1)
+ gemini3 = ChatGoogleGenerativeAI(model="gemini-1.0-pro-001",google_api_key='AIzaSyBot9W5Q-BKQ66NAYRUmVeloXWEbXOXTmM',temperature = 0.1)
+
+ genai.configure(api_key="AIzaSyBmZtXjJgp7yIAo9joNCZGSxK9PbGMcVaA")
+
+
+ def pdf_extractor(link):
+     text = ''
+
+     try:
+         # Fetch the PDF file from the URL
+         response = requests.get(link)
+         response.raise_for_status()  # Raise an error for bad status codes
+
+         # Use BytesIO to handle the PDF content in memory
+         pdf_file = BytesIO(response.content)
+
+         # Load the PDF file
+         reader = PdfReader(pdf_file)
+         for page in reader.pages:
+             text += page.extract_text()  # Extract text from each page
+
+     except requests.exceptions.HTTPError as e:
+         print(f'HTTP error occurred: {e}')
+     except Exception as e:
+         print(f'An error occurred: {e}')
+
+     return [text]
+
+ def web_extractor(link):
+     text = ''
+
+     try:
+         loader = WebBaseLoader(link)
+         pages = loader.load_and_split()
+
+         for page in pages:
+             text+=page.page_content
+     except:
+         pass
+
+     return [text]
+
+
+ def feature_extraction(tag, history , context):
+
+     prompt = f'''
+     You are an intelligent assistant tasked with updating product information. You have two data sources:
+     1. Tag_History: Previously gathered information about the product.
+     2. Tag_Context: New data that might contain additional details.
+     Your job is to read the Tag_Context and update the relevant field in the Tag_History with any new details found. The field to be updated is the {tag} FIELD.
+     Guidelines:
+     - Only add new details that are relevant to the {tag} FIELD.
+     - Do not add or modify any other fields in the Tag_History.
+     - Ensure your response is in coherent sentences, integrating the new details seamlessly into the existing information.
+     Here is the data:
+     Tag_Context: {str(context)}
+     Tag_History: {history}
+     Respond with the updated Tag_History.
+     '''
+
+     model = random.choice([gemini,gemini1,gemini2,gemini3])
+     result = model.invoke(prompt)
+
+     return result.content
+
+ def detailed_feature_extraction(find, context):
+
+     prompt = f'''
+     You are an intelligent assistant tasked with finding product information. You have one data source and one output format:
+     1. Context: The gathered information about the product.
+     2. Format: Details which need to be filled based on Context.
+     Your job is to read the Context and update the relevant field in Format using Context.
+     Guidelines:
+     - Only add details that are relevant to the individual FIELD.
+     - Do not add or modify any other fields in the Format.
+     - If nothing found return None.
+     Here is the data:
+     The Context is {str(context)}
+     The Format is {str(find)}
+     '''
+
+     model = random.choice([gemini,gemini1,gemini2,gemini3])
+     result = model.invoke(prompt)
+
+     return result.content
+
+ def detailed_history(history):
+
+     details = {
+         "Introduction": {
+             "Product Name": None,
+             "Overview of the product": None,
+             "Purpose of the manual": None,
+             "Audience": None,
+             "Additional Details": None
+         },
+         "Specifications": {
+             "Technical specifications": None,
+             "Performance metrics": None,
+             "Additional Details": None
+         },
+         "Product Overview": {
+             "Product features": None,
+             "Key components and parts": None,
+             "Additional Details": None
+         },
+         "Safety Information": {
+             "Safety warnings and precautions": None,
+             "Compliance and certification information": None,
+             "Additional Details": None
+         },
+         "Installation Instructions": {
+             "Unboxing and inventory checklist": None,
+             "Step-by-step installation guide": None,
+             "Required tools and materials": None,
+             "Additional Details": None
+         },
+         "Setup and Configuration": {
+             "Initial setup procedures": None,
+             "Configuration settings": None,
+             "Troubleshooting setup issues": None,
+             "Additional Details": None
+         },
+         "Operation Instructions": {
+             "How to use the product": None,
+             "Detailed instructions for different functionalities": None,
+             "User interface guide": None,
+             "Additional Details": None
+         },
+         "Maintenance and Care": {
+             "Cleaning instructions": None,
+             "Maintenance schedule": None,
+             "Replacement parts and accessories": None,
+             "Additional Details": None
+         },
+         "Troubleshooting": {
+             "Common issues and solutions": None,
+             "Error messages and their meanings": None,
+             "Support Information": None,
+             "Additional Details": None
+         },
+         "Warranty Information": {
+             "Terms and Conditions": None,
+             "Service and repair information": None,
+             "Additional Details": None
+         },
+         "Legal Information": {
+             "Copyright information": None,
+             "Trademarks and patents": None,
+             "Disclaimers": None,
+             "Additional Details": None
+
+         }
+     }
+
+     for key,val in history.items():
+
+         find = details[key]
+
+         details[key] = str(detailed_feature_extraction(find,val))
+
+     return details
+
+
+ def get_embeddings(link):
+
+     print(f"\nCreating Embeddings ----- {link}")
+     history = {
+         "Introduction": "",
+         "Specifications": "",
+         "Product Overview": "",
+         "Safety Information": "",
+         "Installation Instructions": "",
+         "Setup and Configuration": "",
+         "Operation Instructions": "",
+         "Maintenance and Care": "",
+         "Troubleshooting": "",
+         "Warranty Information": "",
+         "Legal Information": ""
+     }
+
+     # Extract Text -----------------------------
+     print("Extracting Text")
+     if link[-3:] == '.md' or link[8:11] == 'en.':
+         text = web_extractor(link)
+     else:
+         text = pdf_extractor(link)
+
+     # Create Chunks ----------------------------
+     print("Writing Tag Data")
+     chunks = text_splitter.create_documents(text)
+
+     for chunk in chunks:
+
+         with concurrent.futures.ThreadPoolExecutor() as executor:
+             future_to_key = {
+                 executor.submit(
+                     feature_extraction, f"Product {key}", history[key], chunk.page_content
+                 ): key for key in history
+             }
+             for future in concurrent.futures.as_completed(future_to_key):
+                 key = future_to_key[future]
+                 try:
+                     response = future.result()
+                     history[key] = response
+                 except Exception as e:
+                     print(f"Error processing {key}: {e}")
+
+     # history = detailed_history(history)
+     print("Creating Vectors")
+     genai_embeddings=[]
+
+     for tag in history:
+         result = genai.embed_content(
+             model="models/embedding-001",
+             content=history[tag],
+             task_type="retrieval_document")
+         genai_embeddings.append(result['embedding'])
+
+
+     return history,genai_embeddings
+
+ global text_splitter
+ global data
+ global history
+
+
+ text_splitter = RecursiveCharacterTextSplitter(
+     chunk_size = 10000,
+     chunk_overlap = 100,
+     separators = ["",''," "]
+ )
+
+
+ if __name__ == '__main__':
+     pass
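
Usage note (not part of the diff): a minimal sketch of how the revised module might be exercised, assuming the configured Google API keys are valid and that MANUAL_URL (a made-up placeholder) points at a downloadable PDF manual. The 768-dimension figure matches the models/embedding-001 vectors used in this file.

# Hypothetical usage sketch -- illustration only, names below are assumptions.
from embedding import pdf_extractor, get_embeddings

MANUAL_URL = "https://example.com/manuals/widget-3000.pdf"  # placeholder URL

# Text extraction alone: pdf_extractor returns a single-element list with the PDF text.
raw_text = pdf_extractor(MANUAL_URL)
print(len(raw_text[0]), "characters extracted")

# Full pipeline: per-section summaries plus one 768-dim embedding per section.
history, embeddings = get_embeddings(MANUAL_URL)
for tag, vector in zip(history, embeddings):
    print(tag, "->", len(vector), "dims")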