import os
import re
import io
import json
from typing import List, Tuple, Union
from pathlib import Path

import gradio as gr
import openai
import pymupdf
from docx import Document
from google.oauth2.service_account import Credentials
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

HF_TOKEN = os.environ.get("HF_TOKEN", None)
LEPTON_API_TOKEN = os.environ.get("LEPTON_API_TOKEN", None)

# Set up Google Sheets API credentials
SCOPES = ['https://www.googleapis.com/auth/spreadsheets']
# Despite the name, this holds the service-account JSON *content* (it is fed to json.loads).
SERVICE_ACCOUNT_FILE = os.environ.get("GOOGLE_SECRET_AUTH_JSON", None)
SPREADSHEET_ID = os.environ.get("SPREADSHEET_ID", None)


def addrow_googlesheets(model, question, document, answer, reasoning, score):
    """Append one evaluation record as a new row of the logging spreadsheet.

    The six arguments are written, in order, to columns A-F of ``Sheet1`` in
    the spreadsheet identified by ``SPREADSHEET_ID``.

    Logging is best-effort: a missing or malformed configuration, or an API
    error, is printed and reported through the return value instead of being
    raised, so that a logging problem never discards a model result.

    Returns:
        True if the row was appended, False otherwise.
    """
    if not SERVICE_ACCOUNT_FILE or not SPREADSHEET_ID:
        print("Google Sheets logging skipped: credentials or spreadsheet ID not configured.")
        return False

    try:
        creds = Credentials.from_service_account_info(json.loads(SERVICE_ACCOUNT_FILE), scopes=SCOPES)
        service = build('sheets', 'v4', credentials=creds)
        body = {
            'values': [[model, question, document, answer, reasoning, score]]
        }
        service.spreadsheets().values().append(
            spreadsheetId=SPREADSHEET_ID,
            range='Sheet1!A:F',
            valueInputOption='RAW',
            insertDataOption='INSERT_ROWS',
            body=body
        ).execute()
        return True
    except HttpError as error:
        print(f"An error occurred: {error}")
        return False
    except ValueError as error:
        # json.JSONDecodeError is a ValueError, so this covers a malformed secret.
        print(f"Google Sheets logging skipped: invalid credentials JSON ({error})")
        return False


PROMPT = """ Given the following QUESTION, DOCUMENT and ANSWER you must analyze the provided answer and determine whether it is faithful to the contents of the DOCUMENT. The ANSWER must not offer new information beyond the context provided in the DOCUMENT. The ANSWER also must not contradict information provided in the DOCUMENT. Output your final verdict by strictly following this format: "PASS" if the answer is faithful to the DOCUMENT and "FAIL" if the answer is not faithful to the DOCUMENT. Show your reasoning. 
-- QUESTION (THIS DOES NOT COUNT AS BACKGROUND INFORMATION): {question} -- DOCUMENT: {document} -- ANSWER: {answer} -- Your output should be in JSON FORMAT with the keys "REASONING" and "SCORE": {{"REASONING": , "SCORE": }} """

# Ready-made question/document/answer triples offered as one-click examples in the UI.
EXAMPLES = [
    {
        "emoji": "🏈",
        "question": "How many yards Hanson score with in the first?",
        "document": "To start the season, the Lions traveled south to Tampa, Florida to take on the Tampa Bay Buccaneers. The Lions scored first in the first quarter with a 23-yard field goal by Jason Hanson. The Buccaneers tied it up with a 38-yard field goal by Connor Barth, then took the lead when Aqib Talib intercepted a pass from Matthew Stafford and ran it in 28 yards. The Lions responded with a 28-yard field goal. In the second quarter, Detroit took the lead with a 36-yard touchdown catch by Calvin Johnson, and later added more points when Tony Scheffler caught an 11-yard TD pass. Tampa Bay responded with a 31-yard field goal just before halftime. The second half was relatively quiet, with each team only scoring one touchdown. First, Detroit's Calvin Johnson caught a 1-yard pass in the third quarter. The game's final points came when Mike Williams of Tampa Bay caught a 5-yard pass. The Lions won their regular season opener for the first time since 2007",
        "answer": "28"
    },
    {
        "emoji": "🫁",
        "question": "Does timing of initial surfactant treatment make a difference in rates of chronic lung disease or mortality in premature infants?",
        # Triple-quoted because the text contains a physical line break.
        "document": """To compare two treatment strategies in preterm infants with or at risk of respiratory distress syndrome: early surfactant administration (within one hour of birth) versus late surfactant administration, in a geographically defined population. The primary outcome was chronic lung disease (CLD) and mortality before/at 36 weeks. Secondary outcomes included: duration of mechanical ventilation and continuous positive airway pressure (CPAP), post-natal steroids for CLD and major neonatal morbidities. 
Premature infants born at 22-32 weeks' gestation between January 2006 and December 2009. Ten neonatal intensive care units (NICUs) in New South Wales (NSW) and Australian Capital Territory (ACT), Australia. Retrospective analysis of prospectively collected data from the regional NICU database in NSW and ACT. Of the 2170 infants who received surfactant, 1182 (54.5%) and 988 (45.5%) received early and late surfactant, respectively. The early surfactant group was less mature (27.1 ± 2.1 versus 29.4 ± 2.1 weeks) and had more CLD and mortality (40.2% versus 20.0%). The multivariable analysis showed early surfactant to be associated with less duration of ventilation, longer duration of CPAP and longer hospital stay but had little or no impact on CLD/mortality.""",
        "answer": "Yes. Early surfactant administration significantly reduces the rates of chronic lung disease and mortality among premature infants. This supports the routine use of early surfactant treatment over alternatives like CPAP, especially in very premature infants. Further large-scale studies are still needed to solidify these findings."
    },
    {
        "emoji": "🏭",
        "question": "Where does Vitol has its headquarter?",
        # Triple-quoted because the text contains a physical line break.
        "document": """Information in the list relates to the most recent fiscal year (mostly FY 2022 or 2023). Rank Name Industry Revenue Profit Employees Headquarters[note 1] State-owned Ref. 
Revenue per worker USD millions 1 Walmart Retail Increase $611,289 $11,680 2,100,000 United States United States No [1] $291,090.00 2 Saudi Aramco Oil and gas Increase $603,651 $159,069 70,496 Saudi Arabia Saudi Arabia Yes [4] $8,562,911.37 3 Amazon Retail Increase $574,785 $30,425 1,525,000 United States United States No [5] $376,908 4 State Grid Corporation of China Electricity Increase $530,009 $8,192 870,287 China China Yes [6] $609,004.85 5 Vitol Commodities Increase $505,000 $15,000 1,560 Switzerland No [7][8] $323,717,948.72 6 China National Petroleum Corporation Oil and gas Increase $483,019 $21,080 1,087,049 China China Yes [9] $444,339.68 7 China Petrochemical Corporation Oil and gas Increase $471,154 $9,657 527,487 China China Yes [10] $893,204.95""",
        "answer": "United States."
    },
    {
        "emoji": "🩺",
        "question": "What does the abbreviation of VD mean in the context of students?",
        "document": "A sexually transmitted infection (STI), also referred to as a sexually transmitted disease (STD) and the older term venereal disease (VD), is an infection that is spread by sexual activity, especially vaginal intercourse, anal sex, oral sex, or sometimes manual sex.",
        "answer": "In the context of student, VD stands usually for Valedictorian, an academic title for the highest-performing student of a graduating class of an academic institution."
    },
    {
        "emoji": "💻",
        "question": "How much market cap does Microsoft, Apple and Nvidia have together?",
        "document": "Here's a list of the top 10 companies by market cap in 2024 (as of February 13, 2024): Company Sector Market Cap (in USD) #1 Microsoft Technology $3.085 trillion #2 Apple Technology $2.889 trillion #3 Saudi Aramco Oil & Gas $1.997 trillion #4 Alphabet (Google) Technology $1.841 trillion #5 Amazon E-commerce $1.790 trillion #6 Nvidia Technology $1.784 trillion #7 Meta Platforms Social Media $1.195 trillion #8 Berkshire Hathaway Diversified Investments $861.74 billion #09 Eli Lilly Pharmaceuticals $699.88 billion #10 TSMC Semiconductors $676.61 billion",
        "answer": "They have a shared market cap of $ 7.758 trillion USD."
    },
    {
        "emoji": "📈",
        "question": "Which company has the largest market cap in 2024?",
        "document": "Here's a list of the top 10 companies by market cap in 2024 (as of February 13, 2024): Company Sector Market Cap (in USD) #1 Microsoft Technology $3.085 trillion #2 Apple Technology $2.889 trillion #3 Saudi Aramco Oil & Gas $1.997 trillion #4 Alphabet (Google) Technology $1.841 trillion #5 Amazon E-commerce $1.790 trillion #6 Nvidia Technology $1.784 trillion #7 Meta Platforms Social Media $1.195 trillion #8 Berkshire Hathaway Diversified Investments $861.74 billion #09 Eli Lilly Pharmaceuticals $699.88 billion #10 TSMC Semiconductors $676.61 billion",
        "answer": "Alphabet has the largest market cap with $1.841 trillion USD"
    },
    {
        "emoji": "🪖",
        "question": "When did World War II start?",
        "document": """World War II[b] or the Second World War was a global conflict that lasted from 1939 to 1945. The vast majority of the world's countries, including all the great powers, fought as part of two opposing military alliances: the Allies and the Axis. Many participating countries invested all available economic, industrial, and scientific capabilities into this total war, blurring the distinction between civilian and military resources. 
Aircraft played a major role, enabling the strategic bombing of population centres and delivery of the only two nuclear weapons ever used in war. It was by far the deadliest conflict in history, resulting in 70–85 million fatalities. Millions died due to genocides, including the Holocaust, as well as starvation, massacres, and disease. In the wake of Axis defeat, Germany, Austria, and Japan were occupied, and war crime tribunals were conducted against German and Japanese leaders. The causes of the war are debated; contributing factors included the rise of fascism in Europe, the Spanish Civil War, the Second Sino-Japanese War, Soviet–Japanese border conflicts, and tensions in the aftermath of World War I. World War II is generally considered to have begun on 1 September 1939, when Nazi Germany, under Adolf Hitler, invaded Poland. The United Kingdom and France declared war on Germany on 3 September. Under the Molotov–Ribbentrop Pact of August 1939, Germany and the Soviet Union had partitioned Poland and marked out their "spheres of influence" across Finland, Estonia, Latvia, Lithuania, and Romania. From late 1939 to early 1941, in a series of campaigns and treaties, Germany conquered or controlled much of continental Europe in a military alliance called the Axis with Italy, Japan, and other countries. Following the onset of campaigns in North and East Africa, and the fall of France in mid-1940, the war continued primarily between the European Axis powers and the British Empire, with the war in the Balkans, the aerial Battle of Britain, the Blitz of the UK, and the Battle of the Atlantic. In June 1941, Germany led the European Axis powers in an invasion of the Soviet Union, opening the Eastern Front, the largest land theatre of war in history.""",
        "answer": "It started on 3 September 1939 when France and United Kingdom declared war on Germany."
    }
]

HEADER = """

Lynx 8B v1.1 & Lynx 70B

Patronus Lynx Demo

**Patronus Lynx** is a state-of-the-art open-source model for hallucination detection. **Context Window**: Lynx 8B v1.1 has a **128k** context window & Lynx 70B has a **8k** context window. **Getting Started**: Provide a question and document or context given to your model in addition to the answer given by the model and then click submit. The output panel will indicate whether the reponse is a hallucination (Fail) or if it is faithful to the given document or context (Pass) through the score Pass or Fail and provide reasoning behind the score. **File Upload**: You can choose to upload a file for the document field. The file must be a PDF, TXT, or DOCX file and within the context window of the model. """

EXAMPLES_HEADER = """ # Try it Yourself! """

UPLOADABLE_FILE_TYPES = [".pdf", ".txt", ".docx", ".doc"]

css = """ .example-button { width: fit-content; font-size: 1rem; font-weight: 400 !important; padding: .5rem 1rem; text-align: start; } .fixed-height-button { height: fit-content; word-break: break-all; font-size: .85rem; } """

# Single source of truth for the selectable models and their inference endpoints.
MODEL_BASE_URLS = {
    "Patronus Lynx 8B v1.1": "https://yb15a7dy-lynx-v1-1-8b.tin.lepton.run/api/v1/",
    "Patronus Lynx 70B": "https://yb15a7dy-patronus-lynx-70b-v1-0.tin.lepton.run/api/v1/",
}
DEFAULT_MODEL_NAME = "Patronus Lynx 8B v1.1"


def update_client_base_url(model_name):
    """Return the endpoint base URL for *model_name*, or None for an unknown name."""
    return MODEL_BASE_URLS.get(model_name)


def parse_patronus_lynx_response(
    response: str,
) -> Tuple[bool, Union[List[str], None]]:
    """
    Parses the response from the Patronus Lynx LLM and returns a tuple of:
    - Whether the response is hallucinated or not.
    - A reasoning trace explaining the decision.

    The verdict defaults to hallucinated (True) and is only cleared when an
    explicit "PASS" score is found. The reasoning is None when no
    ``"REASONING": [...]`` array is present in the response.
    """
    # Default to hallucinated
    hallucination, reasoning = True, None
    reasoning_pattern = r'"REASONING":\s*\[(.*?)\]'
    score_pattern = r'"SCORE":\s*"?\b(PASS|FAIL)\b"?'

    # DOTALL lets the reasoning array span several lines.
    reasoning_match = re.search(reasoning_pattern, response, re.DOTALL)
    score_match = re.search(score_pattern, response)

    if score_match and score_match.group(1) == "PASS":
        hallucination = False

    if reasoning_match:
        # Split on the `", "` separators between items; the opening quote of
        # the first item and the closing quote of the last one are left in
        # place for the caller to remove.
        reasoning = re.split(r"['\"],\s*['\"]", reasoning_match.group(1))

    return hallucination, reasoning


def _strip_outer_quotes(text):
    """Trim surrounding whitespace and at most one quote character from each end of *text*."""
    text = text.strip()
    if text[:1] in ("'", '"'):
        text = text[1:]
    if text[-1:] in ("'", '"'):
        text = text[:-1]
    return text


def model_call(question, document, answer, client_base_url):
    """Ask the selected Lynx endpoint whether *answer* is faithful to *document*.

    Args:
        question: The question that was asked.
        document: The context the answer must be grounded in.
        answer: The answer to evaluate.
        client_base_url: Base URL of the inference endpoint to call.

    Returns:
        A ``(reasoning, score)`` pair of display strings. Both are empty when
        any of the three text inputs is empty, in which case no request is made.
    """
    if question == "" or document == "" or answer == "":
        return "", ""

    client = openai.OpenAI(
        base_url=client_base_url,
        api_key=LEPTON_API_TOKEN
    )
    prompt = PROMPT.format(question=question, document=document, answer=answer)
    # NOTE(review): the model name is kept from the original code; the target
    # model is selected through base_url. Confirm the endpoint accepts it.
    response = client.completions.create(
        model="gpt-3.5-turbo-instruct",
        prompt=prompt,
        temperature=0.0
    )
    hallucination, reasoning = parse_patronus_lynx_response(response.choices[0].text)
    score = "🔴 FAIL 🔴" if hallucination else "🟢 PASS 🟢"
    # The parser returns None when the response has no reasoning array;
    # joining None would raise TypeError and lose the verdict.
    combined_reasoning = _strip_outer_quotes(" ".join(reasoning)) if reasoning else ""
    if client_base_url == MODEL_BASE_URLS["Patronus Lynx 8B v1.1"]:
        model = "Patronus Lynx 8B v1.1"
    else:
        model = "Patronus Lynx 70B"
    addrow_googlesheets(model, question, document, answer, combined_reasoning, score)
    return combined_reasoning, score


def return_approximate_token_size(text, max_token_length=8000):
    """Return True when *text* is estimated to fit within *max_token_length* tokens.

    The estimate is ``len(text) / 4``, i.e. roughly four characters per token.
    """
    approximate_tokens = len(text) / 4
    return approximate_tokens < max_token_length


def get_filetype(filename):
    """Return the lower-cased extension of *filename* without the dot ("" if there is none)."""
    _, dot, extension = filename.rpartition(".")
    return extension.lower() if dot else ""


def extract_text_pymupdf(file):
    """Return the concatenated text of every page of *file*, opened with PyMuPDF."""
    with pymupdf.open(file) as pdf_or_txt:
        return "".join(page.get_text() for page in pdf_or_txt)


def extract_text_python_docx(file):
    """Return the paragraphs of a DOCX *file* joined by newlines, with outer whitespace stripped."""
    doc = Document(file)
    return "\n".join(paragraph.text for paragraph in doc.paragraphs).strip()


def upload_file(filepath):
    """Extract the text of an uploaded file and switch the upload widgets.

    Args:
        filepath: Path of the uploaded file, or None when nothing was uploaded.

    Returns:
        Updates for ``[upload button, file group, file-name label, document text]``.

    Raises:
        gr.Error: If the file type has no text extractor, or the extracted
            text exceeds the approximate token limit.
    """
    if filepath is None:
        return [gr.UploadButton(visible=True, file_count="single", file_types=UPLOADABLE_FILE_TYPES), gr.Group(visible=False), gr.Markdown(""), ""]

    name = Path(filepath).name
    filetype = get_filetype(name)
    if filetype in ("pdf", "txt"):
        extracted_file_text = extract_text_pymupdf(filepath)
    elif filetype == "docx":
        extracted_file_text = extract_text_python_docx(filepath)
    else:
        # Previously an unsupported type (e.g. legacy .doc) silently produced
        # an empty document while the UI reported a successful upload.
        raise gr.Error(f"Unsupported file type '.{filetype}'. Please upload a PDF, TXT, or DOCX file.")

    # return warning if file is too large
    if not return_approximate_token_size(extracted_file_text):
        raise gr.Error("File is too large to process. Please upload a smaller file.")
    return [gr.UploadButton(visible=False), gr.Group(visible=True), gr.Markdown(f"**Uploaded file:** {name}"), extracted_file_text]


def reset_buttons():
    """Return updates that restore the upload button, hide the file group and clear the document."""
    return [gr.UploadButton(visible=True, file_count="single", file_types=UPLOADABLE_FILE_TYPES), gr.Group(visible=False), gr.Markdown(""), gr.Textbox(value="")]


def select_template(template):
    """Return the ``(question, document, answer)`` fields of an example entry."""
    return template["question"], template["document"], template["answer"]


with gr.Blocks(css=css, theme=gr.themes.Default(spacing_size="sm", font=[gr.themes.GoogleFont("Plus Jakarta Sans"), "Arial", "sans-serif"], primary_hue="indigo", secondary_hue="purple")) as demo:
    # Holds the endpoint of the currently selected model between events.
    base_url_state = gr.State(update_client_base_url(DEFAULT_MODEL_NAME))
    gr.Markdown(HEADER)
    model_dropdown = gr.Dropdown(choices=["Patronus Lynx 8B v1.1", "Patronus Lynx 70B"], value="Patronus Lynx 8B v1.1", label="Model", interactive=True)
    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            gr.Markdown("**Your Inputs**")
            question = gr.Textbox(label="Question")
            document = gr.Textbox(label="Document", scale=4)
            with gr.Row():
                u = gr.UploadButton("Upload", visible=True, file_count="single", file_types=UPLOADABLE_FILE_TYPES, scale=1, elem_classes="fixed-height-button")
                file_group = gr.Group(elem_classes="fixed-height-button", visible=False)
                with file_group:
                    file_name = gr.Markdown("")
                    c = gr.ClearButton([u, file_name])
            answer = gr.Textbox(label="Answer")
            with gr.Row():
                clear_btn = gr.ClearButton([question, document, answer])
                submit_button = gr.Button("Submit", variant="primary")
        with gr.Column(scale=1):
            gr.Markdown("**Model Outputs**")
            reasoning = gr.Textbox(label="Reasoning")
            score = gr.Textbox(label="Score (FAIL if Hallucinated, PASS if not)")
    gr.Markdown(" ")
    gr.Markdown(EXAMPLES_HEADER)
    with gr.Row():
        with gr.Column():
            for example in EXAMPLES:
                template_btn = gr.Button(f"{example['emoji']} {example['question']}", elem_classes="example-button")
                template_btn.click(
                    fn=select_template,
                    inputs=[gr.State(example)],
                    outputs=[question, document, answer]
                )

    model_dropdown.change(fn=update_client_base_url, inputs=[model_dropdown], outputs=[base_url_state])
    u.upload(upload_file, u, [u, file_group, file_name, document])
    c.click(reset_buttons, None, [u, file_group, file_name, document])

    # The Submit button and pressing Enter in any input all run the same evaluation.
    evaluation_inputs = [question, document, answer, base_url_state]
    evaluation_outputs = [reasoning, score]
    submit_button.click(fn=model_call, inputs=evaluation_inputs, outputs=evaluation_outputs)
    for textbox in (question, document, answer):
        textbox.submit(fn=model_call, inputs=evaluation_inputs, outputs=evaluation_outputs)

demo.launch()