Langchain
Prerequisites
requirements.txt
scikit-learn
torch
torchvision
torchaudio
huggingface_hub
transformers
langchain
InstructorEmbedding
sentence_transformers
chardet
charset-normalizer==3.1.0
youtube-transcript-api
faiss-gpu
Document Loaders
from langchain.document_loaders import YoutubeLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter settings: target chunk length and the overlap shared by
# neighbouring chunks (both passed straight to the text splitter below).
chunk_size = 1000
chunk_overlap = 100
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

# Source video whose transcript is loaded and split.
video_url = "https://www.youtube.com/watch?v=Jr8gLJr9WKQ&ab_channel=EnglishSkillsMastery"
loader = YoutubeLoader.from_youtube_url(video_url)

# Load the transcript, then cut it into chunks with the splitter above.
transcript = loader.load()
docs = text_splitter.split_documents(transcript)
docs
# [Document(page_content="English by mimicking...
Embedding
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings
import torch

# Instructor embedding model used to embed the transcript chunks and queries.
embeding_model_name = "hkunlp/instructor-large"
# Use the GPU when one is visible to PyTorch; otherwise fall back to the CPU
# so this cell still runs on a CPU-only runtime.
model_kwargs = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
embeddings = HuggingFaceInstructEmbeddings(
    model_name=embeding_model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# Quick check: embed a single sentence and display the resulting vector.
text = "This is a test document."
query_result = embeddings.embed_query(text)
query_result
#[0.01804392784833908,
# -0.0509769469499588,
# -0.0229482538998127,
# 0.0186158400028944,
# 0.024165937677025795,
# ...
Vectorstores
# FAISS must be in scope before the index is built in this cell.
from langchain.vectorstores import FAISS

k = 5  # number of transcript chunks to retrieve for the question
question = "What is teacher's name?"

# Build a FAISS index over the transcript chunks using the embedding model.
db = FAISS.from_documents(docs, embeddings)

# Retrieve only the k chunks most similar to the question, so the prompt
# context is the relevant part of the transcript rather than all of it.
relevant_docs = db.similarity_search(question, k=k)

# Concatenate the retrieved chunks into one context string for the prompt.
docs_page_content = " ".join(d.page_content for d in relevant_docs)
docs_page_content
# "nervous on my very first day I felt excited but also nervous it was a struggle...
LLM
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.document_loaders import YoutubeLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import HuggingFaceHub, OpenAI
# Hugging Face Hub model used to answer the question.
# NOTE(review): HuggingFaceHub presumably needs a Hub API token
# (HUGGINGFACEHUB_API_TOKEN) in the environment — confirm it is set.
llm_repo_id = "google/flan-t5-large"
# "openchat/openchat-3.5-1210"
llm = HuggingFaceHub(
    repo_id=llm_repo_id,
    model_kwargs={"temperature": 0.05, "max_length": 512},
)

# Prompt with two placeholders: the user question and the transcript text.
prompt = PromptTemplate(
    input_variables=["question", "docs"],
    template="""
You are a helpful Youtube assistant that can answer questions about videos based on the video's transcript
Answer the following question: {question}
By searching the following video transcript: {docs}
Only use the factual information from the transcript to answer the question.
If you feel like you do not have enough information to answer the question, say "I don't know".
Your answers should be detailed.
""")

# Bind the prompt to the model and run it with the question and context text.
chain = LLMChain(llm=llm, prompt=prompt)
response = chain.run(question=question, docs=docs_page_content)
response
# Henry
Last updated