class LangChainChunker:
    """Split a piece of text into chunks using LangChain's CharacterTextSplitter."""

    def __init__(self, text):
        """Store the text that `chunker` will later split."""
        self.text = text

    def chunker(self, size=1000):
        """Split the stored text on single spaces into chunks.

        Args:
            size: Value passed to the splitter as ``chunk_size``.

        Returns:
            Whatever ``CharacterTextSplitter.split_text`` returns for
            ``self.text``.
        """
        # Imported lazily so the module can be loaded without langchain.
        from langchain.text_splitter import CharacterTextSplitter

        # NOTE: an earlier comment here described attaching the video duration
        # to each chunk ([[chunk, duration]]); that is not implemented.
        # NOTE(review): chunk_overlap is normally an integer character count;
        # 0.9 looks unintended -- confirm whether 0 or a fraction of `size`
        # was meant. Left unchanged to preserve current behavior.
        text_splitter = CharacterTextSplitter(
            separator=" ",
            chunk_size=size,
            chunk_overlap=0.9,
        )
        return text_splitter.split_text(self.text)

    def __sizeof__(self) -> int:
        """Return the length of the stored text.

        Note that this is a character count, not a memory size in bytes.
        """
        return len(self.text)


def getSubsText(video_id="", getGenerated=False):
    """Return the transcript text of a YouTube video as a single line.

    The last transcript yielded by ``list_transcripts(video_id)`` is fetched,
    formatted with ``TextFormatter`` and has its newlines replaced by spaces.

    Args:
        video_id: Identifier passed to ``YouTubeTranscriptApi.list_transcripts``.
        getGenerated: Currently has no effect (see TODO below).

    Returns:
        The formatted transcript with every newline replaced by a space.
    """
    # Imported lazily so the module can be loaded without the dependency.
    from youtube_transcript_api import YouTubeTranscriptApi as ytapi
    from youtube_transcript_api.formatters import TextFormatter

    transcripts = ytapi.list_transcripts(video_id)

    if getGenerated:
        # TODO: implement getGenerated
        pass

    # Only the last transcript's content is ever used, so select it first and
    # fetch once instead of fetching every transcript and discarding the rest.
    last_transcript = None
    for last_transcript in transcripts:
        pass

    # With no transcripts, fall back to the empty placeholder used previously.
    data = last_transcript.fetch() if last_transcript is not None else ""

    return TextFormatter().format_transcript(data).replace("\n", " ")