from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.chains import RetrievalQA
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    pipeline,
    GenerationConfig
)
from textwrap import dedent


class lamini:
    """Loader for the LaMini-Flan-T5 checkpoint wrapped as a LangChain LLM."""

    def __init__(self):
        pass

    def load_model(self, task="text2text-generation", **kwargs) -> HuggingFacePipeline:
        """Build a transformers pipeline for the model and wrap it for LangChain.

        - model: MBZUAI/LaMini-Flan-T5-248M

        Args:
            task (str): transformers pipeline task name.
            **kwargs: optional generation settings; recognised keys are
                ``max_length`` (default 512), ``temperature`` (default 0),
                ``top_p`` (default 0.95) and ``repetition_penalty``
                (default 1.15). Other keys are ignored.

        Returns:
            HuggingFacePipeline: LangChain LLM wrapping the pipeline.
        """
        model_id = "MBZUAI/LaMini-Flan-T5-248M"

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
        gen_config = GenerationConfig.from_pretrained(model_id)

        # NOTE(review): no `do_sample` flag is passed here, so whether
        # temperature/top_p actually take effect depends on the checkpoint's
        # generation config - confirm.
        pipe = pipeline(
            task,  # previously hard-coded, which silently ignored the argument
            model=model,
            tokenizer=tokenizer,
            generation_config=gen_config,
            max_length=kwargs.get("max_length", 512),
            top_p=kwargs.get("top_p", 0.95),
            temperature=kwargs.get("temperature", 0),
            repetition_penalty=kwargs.get("repetition_penalty", 1.15),
        )
        return HuggingFacePipeline(pipeline=pipe)


class templates:
    """Instruction-prefixed prompt helpers around a LangChain LLM."""

    def __init__(self, llm: HuggingFacePipeline):
        self.llm = llm

    def _instruct(self, instruction, text, **kwargs):
        """Prepend *instruction* to *text* and run the result through the LLM."""
        return self.llm(instruction + text, **kwargs)

    def summarize(self, text, **kwargs):
        """Summarize text

        Args:
            text (str): text to summarize
            **kwargs: forwarded unchanged to the LLM call

        Returns:
            str: summarized text
        """
        return self._instruct("summarize for better understanding: ", text, **kwargs)

    def generate_title(self, text, **kwargs):
        """Generate a title for text

        Args:
            text (str): text to generate title for
            **kwargs: forwarded unchanged to the LLM call

        Returns:
            str: title
        """
        return self._instruct("generate a title for this text: ", text, **kwargs)

    def generate_tile(self, text, **kwargs):
        """Misspelled legacy name of :meth:`generate_title`, kept for callers."""
        return self.generate_title(text, **kwargs)


class qa_template:
    """Retrieval question answering over a text knowledge base, with a Gradio UI."""

    def __init__(self, llm):
        from langchain.chains.retrieval_qa.base import BaseRetrievalQA

        self.llm = llm
        # Stays None until load() has built a chain; the original only
        # annotated the attribute, so reading it before load() raised
        # AttributeError.
        self.qa_inf: BaseRetrievalQA | None = None

    def load(self, knowledge_base):
        """Load knowledge base

        Chunks the text, embeds the chunks into a Chroma vector store and
        builds a "stuff" RetrievalQA chain on top of it. The chain is also
        stored on ``self.qa_inf``.

        Args:
            knowledge_base (str): knowledge base to load

        Returns:
            BaseRetrievalQA: (optional to use) returns QA interface
        """
        # Imported lazily so the module can be imported without these extras.
        from utils import LangChainChunker
        from langchain.vectorstores import Chroma

        embeds = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large")
        chunks = LangChainChunker(knowledge_base).chunker(size=512)
        retriever = Chroma.from_texts(chunks, embeds).as_retriever()

        self.qa_inf = RetrievalQA.from_chain_type(
            llm=self.llm, chain_type="stuff", retriever=retriever
        )
        return self.qa_inf

    def start_gradio(self, title: str):
        """Start gradio interface

        Builds a Blocks UI with a chat window, a video-ID box that (re)loads
        the knowledge base from the video's subtitles, and a question box,
        then launches it.

        Args:
            title (str): currently unused; kept so existing callers keep working.
        """
        import gradio as gr

        load = self.load

        def interface(msg, history):
            """Answer *msg* with the loaded QA chain and extend the chat history."""
            history = history or []
            if self.qa_inf is None:
                # No chain has been built yet, so there is nothing to query.
                res = "No video is loaded yet - enter a video ID first."
            else:
                # The original read `res` before assigning it and never
                # queried the chain, so every submit raised UnboundLocalError.
                res = self.qa_inf.run(msg)
            history.append((msg, res))
            # Empty string clears the question box; history refreshes the chat.
            return "", history

        def reload(video_id):
            """Fetch the subtitles for *video_id* and rebuild the QA chain."""
            from utils import getSubsText

            print(f"Setting up {video_id}")
            subs = getSubsText(video_id)
            _ = load(subs)

        # NOTE(review): the source file had lost its line breaks, so the
        # nesting of the layout below and the line breaks inside the Markdown
        # texts are a reconstruction - confirm against the intended UI.
        with gr.Blocks() as demo:
            with gr.Column():
                gr.Markdown(dedent("""
                    # video to QA
                    A test implementation to use vectorstores and mini llms to create a
                    question answer chatbot interface for _youtube videos_
                    """))
                chatbot = gr.Chatbot()
                with gr.Row():
                    with gr.Column():
                        videoId = gr.Textbox(label="Video ID", placeholder="Enter video ID here")
                        msg = gr.Textbox(label="Question Box", placeholder="Enter your question here")
                        gr.ClearButton([msg, videoId, chatbot])
                    gr.Markdown(
                        dedent("""
                            ## Getting started
                            to start up you need to enter the video ID of youtube video first

                            Get a youtube video which has English dialog
                            > ex: in this `BsnCpESUEqM` is the video ID
                            ```
                            ^^^^^^^^^^^
                            video_id
                            ```
                            > in url paramets are seperated by `?` and for video id its `?v`

                            copy-paste the video id to the textbox and press return/enter and wait ~5 seconds to fetch video information

                            ---

                            Now in the Question Box _box_/feild start typing the quesions and press return/enter to send to llm
                            """)
                    )

            msg.submit(interface, [msg, chatbot], [msg, chatbot])
            videoId.submit(reload, [videoId])

        demo.launch()