Langchain

Prerequisites

requirements.txt
scikit-learn
torch
torchvision 
torchaudio

huggingface_hub
transformers
langchain
InstructorEmbedding
sentence_transformers

chardet
charset-normalizer==3.1.0
youtube-transcript-api
faiss-gpu

Document Loaders

from langchain.document_loaders import YoutubeLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Source video and the chunking parameters applied to its transcript.
video_url = "https://www.youtube.com/watch?v=Jr8gLJr9WKQ&ab_channel=EnglishSkillsMastery"
chunk_size, chunk_overlap = 1000, 100

# Load the video's transcript as LangChain documents.
loader = YoutubeLoader.from_youtube_url(video_url)
transcript = loader.load()

# Split the transcript into overlapping chunks for embedding.
splitter_options = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
docs = text_splitter.split_documents(transcript)

# Bare expression: shows the chunk list when run as a notebook/REPL cell.
docs

# [Document(page_content="English by mimicking...

Embedding

from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings

# Hugging Face model id of the instruction-tuned embedding model.
embeding_model_name = "hkunlp/instructor-large"

# Options forwarded to the model constructor and to its encode call.
model_kwargs = dict(device="cuda")
encode_kwargs = dict(normalize_embeddings=True)

embeddings = HuggingFaceInstructEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=embeding_model_name,
)

# Smoke test: embed a single sentence and inspect the resulting vector.
text = "This is a test document."
query_result = embeddings.embed_query(text)

# Bare expression: shows the vector when run as a notebook/REPL cell.
query_result

#[0.01804392784833908,
# -0.0509769469499588,
# -0.0229482538998127,
# 0.0186158400028944,
# 0.024165937677025795,
# ...

Vectorstores

# FAISS is only imported further down in this file; import it here so this
# section runs when the file is executed top to bottom.
from langchain.vectorstores import FAISS

# Number of transcript chunks to retrieve, and the question to answer.
k = 5
question = "What is teacher's name?"

# Index every transcript chunk in a FAISS vector store.
db = FAISS.from_documents(docs, embeddings)

# Retrieve only the k chunks most similar to the question. Previously the
# index was built but never queried, so every chunk was joined into the
# prompt context and `k` / `question` had no effect on it.
relevant_docs = db.similarity_search(question, k=k)
docs_page_content = " ".join(d.page_content for d in relevant_docs)

# Bare expression: shows the joined context when run as a notebook/REPL cell.
docs_page_content

# "nervous on my very first day I felt excited but also nervous it was a struggle...

LLM

from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.document_loaders import YoutubeLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import HuggingFaceHub, OpenAI

# Hugging Face Hub repository of the model used to answer the question.
llm_repo_id = "google/flan-t5-large"
# Alternative model: "openchat/openchat-3.5-1210"

# Generation settings passed through to the hosted model.
generation_kwargs = {"temperature": 0.05, "max_length": 512}
llm = HuggingFaceHub(repo_id=llm_repo_id, model_kwargs=generation_kwargs)

# Prompt text with two placeholders: {question} and {docs}.
qa_template = """
You are a helpful Youtube assistanct that can answer questions about videos based on the video's transcript
        
Answer the following question: {question}
By searching the following video transcript: {docs}

Only use the factual information from the transcipt to answer the question.

If you feel like you do not have enough information to answer the question, say "I don't know".

Your answers should be detailed.
"""
prompt = PromptTemplate(
    template=qa_template,
    input_variables=["question", "docs"],
)

# Fill the prompt with the question and transcript text, then run the model.
chain = LLMChain(prompt=prompt, llm=llm)
response = chain.run(docs=docs_page_content, question=question)

# Bare expression: shows the answer when run as a notebook/REPL cell.
response

# Henry

Last updated