Learn how to build a RAG (Retrieval-Augmented Generation) chatbot that can answer questions about any webpage by combining ScrapeGraph's Markdownify service with the LanceDB vector store and OpenAI, orchestrated as a Burr application.

Try it yourself in our interactive notebooks.

The Goal

We’ll create a chatbot that can:

Webpage Ingestion: Convert any webpage to markdown format
Content Chunking: Split content into manageable chunks
Vector Storage: Store and index chunks in LanceDB
Question Answering: Answer questions using relevant chunks

Code Example

from burr.core import action, State, ApplicationBuilder
from scrapegraph_py import Client
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
import openai
import tiktoken
from typing import List

# Configure LanceDB's OpenAI embedding function so text is embedded
# automatically on insert and at query time (1536-dimensional vectors)
embedding_fn = get_registry().get("openai").create(name="text-embedding-3-small")

# Schema for storing text chunks
class TextDocument(LanceModel):
    url: str
    position: int
    text: str = embedding_fn.SourceField()
    vector: Vector(embedding_fn.ndims()) = embedding_fn.VectorField()

# Action to fetch and convert webpage to markdown
@action(reads=[], writes=["markdown_content"])
def fetch_webpage(state: State, webpage_url: str) -> State:
    client = Client()
    response = client.markdownify(website_url=webpage_url)
    return state.update(markdown_content=response["result"])
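
# The original snippet calls get_text_chunks without defining it; this is a
# minimal sketch using tiktoken (imported above). The 512-token window size
# is an assumption; swap in any chunking strategy you prefer.
def get_text_chunks(text: str, chunk_size: int = 512) -> List[str]:
    encoding = tiktoken.get_encoding("cl100k_base")
    tokens = encoding.encode(text)
    return [
        encoding.decode(tokens[i:i + chunk_size])
        for i in range(0, len(tokens), chunk_size)
    ]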

# Action to chunk, embed, and store the page content
@action(reads=["markdown_content"], writes=[])
def embed_and_store(state: State, webpage_url: str) -> State:
    chunks = get_text_chunks(state["markdown_content"])
    con = lancedb.connect("./webpages")
    # Recreate the table on each run; the embedding function fills in the
    # `vector` column automatically from the `text` source field
    table = con.create_table("chunks", schema=TextDocument, mode="overwrite")
    table.add([{
        "text": chunk,
        "url": webpage_url,
        "position": i
    } for i, chunk in enumerate(chunks)])
    return state

# Action to answer questions using the most relevant chunks
@action(reads=[], writes=["llm_answer"])
def ask_question(state: State, user_query: str) -> State:
    chunks_table = lancedb.connect("./webpages").open_table("chunks")
    # The embedding function on the schema embeds the query string for us
    relevant_chunks = chunks_table.search(user_query).limit(3).to_list()
    # Pass only the chunk text to the model, not the raw rows (which
    # would include the embedding vectors)
    context = "\n\n".join(chunk["text"] for chunk in relevant_chunks)

    response = openai.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": f"Answer based on the following context:\n\n{context}"},
            {"role": "user", "content": user_query}
        ]
    )
    return state.update(llm_answer=response.choices[0].message.content)
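
The snippet imports ApplicationBuilder but never uses it. Below is a minimal sketch of how the three actions might be wired into a Burr application; the linear transition order, the example URL, and the halt point are assumptions rather than part of the original example.

app = (
    ApplicationBuilder()
    .with_actions(fetch_webpage, embed_and_store, ask_question)
    .with_transitions(
        ("fetch_webpage", "embed_and_store"),
        ("embed_and_store", "ask_question"),
    )
    .with_entrypoint("fetch_webpage")
    .build()
)

# Runtime inputs are passed to the actions that declare them as parameters
_, _, final_state = app.run(
    halt_after=["ask_question"],
    inputs={
        "webpage_url": "https://scrapegraphai.com",  # assumed example URL
        "user_query": "Who are the founders of ScrapeGraphAI?",
    },
)
print(final_state["llm_answer"])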

Example Output

{
    "question": "Who are the founders of ScrapeGraphAI?",
    "answer": "The founders of ScrapeGraphAI are:\n\n1. Marco Perini - Founder & Technical Lead\n2. Marco Vinciguerra - Founder & Software Engineer\n3. Lorenzo Padoan - Founder & Product Engineer"
}

Have a suggestion for a new example? Contact us with your use case or contribute directly on GitHub.