Learn how to build a RAG (Retrieval Augmented Generation) chatbot that can answer questions about any webpage by combining ScrapeGraph's Markdownify service with LanceDB vector store and OpenAI.
Try it yourself in our interactive notebooks:
The Goal
We'll create a chatbot that can:
| Feature | Description |
|---|---|
Webpage Ingestion | Convert any webpage to markdown format |
Content Chunking | Split content into manageable chunks |
Vector Storage | Store and index chunks in LanceDB |
Question Answering | Answer questions using relevant chunks |
Code Example
from burr.core import action, State, ApplicationBuilder
from scrapegraph_py import Client
import lancedb
from lancedb.pydantic import LanceModel, Vector
import openai
import tiktoken
from typing import List, Optional
# Schema for storing text chunks
class TextDocument(LanceModel):
    """LanceDB row schema for one chunk of a webpage's markdown."""
    # Source page URL the chunk was extracted from.
    url: str
    # Zero-based position of the chunk within the page.
    position: int
    # The chunk's markdown text.
    text: str
    vector: Vector(dim=1536)  # OpenAI embedding dimensions
# Action to fetch and convert webpage to markdown
@action(reads=[], writes=["markdown_content"])
def fetch_webpage(state: State, webpage_url: str) -> State:
    """Fetch *webpage_url* via ScrapeGraph and store its markdown in state.

    Writes the converted page text into the ``markdown_content`` state key.
    """
    sg_client = Client()
    conversion = sg_client.markdownify(website_url=webpage_url)
    return state.update(markdown_content=conversion["result"])
# Action to embed and store chunks
@action(reads=["markdown_content"], writes=[])
def embed_and_store(state: State, webpage_url: str) -> State:
    """Chunk the fetched markdown and persist the chunks in LanceDB.

    Reads ``markdown_content`` from state; writes nothing back to state
    (the data lands in the ``./webpages`` LanceDB database instead).
    """
    chunks = get_text_chunks(state["markdown_content"])
    con = lancedb.connect("./webpages")
    # mode="overwrite" prevents the ValueError that create_table raises when
    # the "chunks" table already exists from a previous ingestion run.
    table = con.create_table("chunks", schema=TextDocument, mode="overwrite")
    table.add([
        {
            "text": chunk,
            "url": webpage_url,
            "position": i,
        }
        for i, chunk in enumerate(chunks)
    ])
    return state
# Action to answer questions
@action(reads=[], writes=["llm_answer"])
def ask_question(state: State, user_query: str) -> State:
    """Answer *user_query* using the most relevant stored chunks.

    Retrieves the top-3 matching chunks from LanceDB and asks GPT-4 to
    answer based on them; writes the answer to the ``llm_answer`` state key.
    """
    chunks_table = lancedb.connect("./webpages").open_table("chunks")
    relevant_chunks = chunks_table.search(user_query).limit(3).to_list()
    # Keep only the chunk text: each result row also carries the 1536-dim
    # embedding vector, and interpolating the raw dicts would flood the
    # prompt with thousands of floats.
    context = "\n\n".join(chunk["text"] for chunk in relevant_chunks)
    response = openai.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": f"Answer based on: {context}"},
            {"role": "user", "content": user_query},
        ],
    )
    return state.update(llm_answer=response.choices[0].message.content)
Example Output
{
"question": "Who are the founders of ScrapeGraphAI?",
"answer": "The founders of ScrapeGraphAI are:\n\n1. Marco Perini - Founder & Technical Lead\n2. Marco Vinciguerra - Founder & Software Engineer\n3. Lorenzo Padoan - Founder & Product Engineer"
}
Have a suggestion for a new example? Contact us with your use case or contribute directly on GitHub.