Skip to main content

Overview

LangGraph is part of the LangChain ecosystem, so vanilla @tool-decorated wrappers around the scrapegraph-py SDK plug straight into create_agent, ToolNode, or any custom StateGraph node β€” no third-party integration package needed.

LangGraph docs

Official LangGraph documentation

scrapegraph-py on PyPI

The official Python SDK for ScrapeGraph v2

Installation

pip install langchain langchain-openai langgraph scrapegraph-py
Set your keys:
export SGAI_API_KEY="your-scrapegraph-key"
export OPENAI_API_KEY="your-openai-key"
Get your ScrapeGraph API key from the dashboard.

Build the toolkit

Save as sgai_tools.py β€” every example below imports from it.
sgai_tools.py
from typing import Optional
from langchain_core.tools import tool
from scrapegraph_py import ScrapeGraphAI, MarkdownFormatConfig, JsonFormatConfig

# Single shared client used by every tool below.
sgai = ScrapeGraphAI()  # reads SGAI_API_KEY from env

def _unwrap(result):
    """Return the SDK response payload as a plain dict."""
    if result.error:
        raise RuntimeError(f"ScrapeGraph error: {result.error}")
    data = result.data
    return data.model_dump() if hasattr(data, "model_dump") else data

@tool
def scrape(url: str) -> dict:
    """Fetch a web page and return its content as markdown."""
    # Markdown is the only format requested here; _unwrap raises on API errors.
    response = sgai.scrape(url=url, formats=[MarkdownFormatConfig()])
    return _unwrap(response)

@tool
def extract(url: str, prompt: str) -> dict:
    """Extract structured data from a web page using a natural-language prompt."""
    response = sgai.extract(prompt=prompt, url=url)
    return _unwrap(response)

@tool
def search(query: str, num_results: int = 3) -> dict:
    """Run an AI web search; returns ranked results with fetched content."""
    response = sgai.search(query=query, num_results=num_results)
    return _unwrap(response)

@tool
def crawl_start(url: str, max_depth: int = 2, max_pages: int = 10) -> dict:
    """Start a multi-page crawl job. Returns a dict including the crawl `id`."""
    # Every crawled page is stored as markdown; the job id in the response
    # is what crawl_get/crawl_stop/crawl_resume operate on.
    job = sgai.crawl.start(
        url=url,
        max_depth=max_depth,
        max_pages=max_pages,
        formats=[MarkdownFormatConfig()],
    )
    return _unwrap(job)

@tool
def crawl_get(crawl_id: str) -> dict:
    """Fetch the status and result of a crawl job."""
    response = sgai.crawl.get(crawl_id)
    return _unwrap(response)

@tool
def crawl_stop(crawl_id: str) -> dict:
    """Stop a running crawl."""
    response = sgai.crawl.stop(crawl_id)
    return _unwrap(response)

@tool
def crawl_resume(crawl_id: str) -> dict:
    """Resume a stopped crawl."""
    response = sgai.crawl.resume(crawl_id)
    return _unwrap(response)

@tool
def monitor_create(url: str, interval: str, name: Optional[str] = None, prompt: Optional[str] = None) -> dict:
    """Create a scheduled monitor. If `prompt` is given, each tick stores
    JSON extraction; otherwise it stores markdown. `interval` is cron syntax."""
    # A prompt switches the stored format from raw markdown to prompted JSON.
    if prompt:
        formats = [JsonFormatConfig(prompt=prompt)]
    else:
        formats = [MarkdownFormatConfig()]
    response = sgai.monitor.create(url=url, interval=interval, name=name, formats=formats)
    return _unwrap(response)

@tool
def monitor_list() -> list:
    """List all monitors."""
    response = sgai.monitor.list()
    return _unwrap(response)

@tool
def monitor_get(monitor_id: str) -> dict:
    """Get one monitor by id."""
    response = sgai.monitor.get(monitor_id)
    return _unwrap(response)

@tool
def monitor_pause(monitor_id: str) -> dict:
    """Pause a monitor."""
    response = sgai.monitor.pause(monitor_id)
    return _unwrap(response)

@tool
def monitor_resume(monitor_id: str) -> dict:
    """Resume a paused monitor."""
    response = sgai.monitor.resume(monitor_id)
    return _unwrap(response)

@tool
def monitor_delete(monitor_id: str) -> dict:
    """Delete a monitor."""
    # _unwrap is called only for its error check; the delete payload itself
    # is not returned, so confirm with the id instead.
    response = sgai.monitor.delete(monitor_id)
    _unwrap(response)
    return {"deleted": monitor_id}

@tool
def history_list(service: Optional[str] = None, page: int = 1, limit: int = 20) -> dict:
    """List recent API request history."""
    response = sgai.history.list(service=service, page=page, limit=limit)
    return _unwrap(response)

@tool
def credits() -> dict:
    """Check remaining ScrapeGraph API credits."""
    response = sgai.credits()
    return _unwrap(response)

# Convenience registry: import this to hand every ScrapeGraph tool to an agent.
ALL_TOOLS = [
    scrape, extract, search,
    crawl_start, crawl_get, crawl_stop, crawl_resume,
    monitor_create, monitor_list, monitor_get,
    monitor_pause, monitor_resume, monitor_delete,
    history_list, credits,
]

Endpoint β†’ tool reference

| ScrapeGraph endpoint | SDK call | Tool |
| --- | --- | --- |
| POST /scrape | `sgai.scrape(url=...)` | `scrape` |
| POST /extract | `sgai.extract(prompt=..., url=...)` | `extract` |
| POST /search | `sgai.search(query=...)` | `search` |
| POST /crawl | `sgai.crawl.start(url=...)` | `crawl_start` |
| GET /crawl/{id} | `sgai.crawl.get(id)` | `crawl_get` |
| POST /crawl/{id}/stop | `sgai.crawl.stop(id)` | `crawl_stop` |
| POST /crawl/{id}/resume | `sgai.crawl.resume(id)` | `crawl_resume` |
| POST /monitor | `sgai.monitor.create(url=..., interval=...)` | `monitor_create` |
| GET /monitor | `sgai.monitor.list()` | `monitor_list` |
| GET /monitor/{id} | `sgai.monitor.get(id)` | `monitor_get` |
| POST /monitor/{id}/pause | `sgai.monitor.pause(id)` | `monitor_pause` |
| POST /monitor/{id}/resume | `sgai.monitor.resume(id)` | `monitor_resume` |
| DELETE /monitor/{id} | `sgai.monitor.delete(id)` | `monitor_delete` |
| GET /history | `sgai.history.list(...)` | `history_list` |
| GET /credits | `sgai.credits()` | `credits` |

Option A β€” prebuilt agent via create_agent

Fastest path: LangChain v1’s create_agent returns a compiled LangGraph with the standard ReAct loop baked in β€” one call wires up every tool behind an LLM router.
from langchain.agents import create_agent
from langchain_openai import ChatOpenAI
from sgai_tools import ALL_TOOLS

# temperature=0 makes tool-routing decisions reproducible.
llm = ChatOpenAI(model="gpt-4o", temperature=0)
agent = create_agent(
    model=llm,
    tools=ALL_TOOLS,
    system_prompt="You are a web research agent. Use ScrapeGraph tools to gather and extract web data.",
)

# invoke() runs the ReAct loop to completion and returns the final graph state;
# the last message holds the agent's answer.
final_state = agent.invoke({
    "messages": [("user", "Search for 'best AI scraping tools 2026' and extract the top 3 names into JSON.")],
})
print(final_state["messages"][-1].content)
langgraph.prebuilt.create_react_agent still exists but is deprecated in LangGraph v1.0 β€” use create_agent from langchain.agents.

Option B β€” custom StateGraph with ToolNode

Use this when you need custom routing, streaming, interrupts, or checkpointing.
from typing import Annotated, TypedDict
from langchain_core.messages import BaseMessage
from langchain_openai import ChatOpenAI
from langgraph.graph import StateGraph, START
from langgraph.graph.message import add_messages
from langgraph.prebuilt import ToolNode, tools_condition
from sgai_tools import scrape, extract, search, crawl_start, crawl_get, credits

class State(TypedDict):
    # Conversation history; the add_messages reducer appends new messages
    # to the list instead of overwriting it on each node update.
    messages: Annotated[list[BaseMessage], add_messages]

# Subset of the toolkit this graph exposes to the model.
tools = [scrape, extract, search, crawl_start, crawl_get, credits]
# bind_tools advertises the tool schemas so the model can emit tool calls.
llm = ChatOpenAI(model="gpt-4o", temperature=0).bind_tools(tools)

def call_model(state: State):
    """Run the tool-bound LLM over the conversation and append its reply."""
    response = llm.invoke(state["messages"])
    return {"messages": [response]}

graph = StateGraph(State)
graph.add_node("agent", call_model)
graph.add_node("tools", ToolNode(tools))
graph.add_edge(START, "agent")
# tools_condition routes to "tools" when the last AI message contains
# tool calls, otherwise to END.
graph.add_conditional_edges("agent", tools_condition)
graph.add_edge("tools", "agent")
app = graph.compile()

out = app.invoke({
    "messages": [("user", "Extract the top stories from https://news.ycombinator.com")],
})
print(out["messages"][-1].content)

Option C β€” deterministic pipeline

When the sequence is known in advance β€” e.g. search β†’ pick URL β†’ extract β€” skip the agent loop and call tools directly from nodes. No LLM routing, fully reproducible.
from typing import TypedDict
from langgraph.graph import StateGraph, START, END
from sgai_tools import search, extract

class State(TypedDict):
    # User-supplied search query.
    query: str
    # URL of the top search hit, filled in by do_search.
    top_url: str
    # Structured extraction result, filled in by do_extract.
    data: dict

def do_search(state: State):
    """Search the web for ``state["query"]`` and record the top hit's URL.

    Raises ValueError when the search returns no results, instead of the
    opaque IndexError the bare ``[0]`` index would produce.
    """
    hits = search.invoke({"query": state["query"], "num_results": 1})
    results = hits.get("results") or []
    if not results:
        raise ValueError(f"No search results for query: {state['query']!r}")
    return {"top_url": results[0]["url"]}

def do_extract(state: State):
    """Run a structured extraction against the URL chosen by do_search."""
    request = {
        "url": state["top_url"],
        "prompt": "Extract the product name and price as JSON",
    }
    return {"data": extract.invoke(request)}

# Fixed two-step pipeline: search first, then extract from the winning URL.
g = StateGraph(State)
g.add_node("search", do_search)
g.add_node("extract", do_extract)
g.add_edge(START, "search")
g.add_edge("search", "extract")
g.add_edge("extract", END)
pipeline = g.compile()

print(pipeline.invoke({"query": "iPhone 15 Pro price apple.com"}))

Crawl as a background node

Crawls are async. Wrap start + poll in a single node so the graph advances only when the job completes.
import time
from typing import TypedDict
from langgraph.graph import StateGraph, START, END
from sgai_tools import crawl_start, crawl_get

class CrawlState(TypedDict):
    # Root URL to crawl.
    url: str
    # Job id returned by crawl_start, filled in by run_crawl.
    crawl_id: str
    # Final crawl payload (status + pages), filled in by run_crawl.
    result: dict

def run_crawl(state: CrawlState, poll_interval: float = 5.0, timeout: float = 600.0):
    """Start a crawl and block until it completes or fails.

    The original loop polled forever; a stuck job would hang the graph.
    LangGraph calls nodes with the state only, so the added keyword-only-by-
    default parameters stay backward-compatible.

    Args:
        state: Must contain the root ``url`` to crawl.
        poll_interval: Seconds to sleep between status checks.
        timeout: Maximum seconds to wait before giving up.

    Raises:
        TimeoutError: If the job does not finish within ``timeout`` seconds.
    """
    job = crawl_start.invoke({"url": state["url"], "max_depth": 2, "max_pages": 10})
    crawl_id = job["id"]
    # monotonic() is immune to wall-clock adjustments while we wait.
    deadline = time.monotonic() + timeout
    while True:
        info = crawl_get.invoke({"crawl_id": crawl_id})
        if info["status"] in ("completed", "failed"):
            return {"crawl_id": crawl_id, "result": info}
        if time.monotonic() >= deadline:
            raise TimeoutError(f"Crawl {crawl_id} did not finish within {timeout:.0f}s")
        time.sleep(poll_interval)

# Single-node graph: the crawl node blocks until the job finishes,
# so downstream consumers always see a terminal result.
g = StateGraph(CrawlState)
g.add_node("crawl", run_crawl)
g.add_edge(START, "crawl")
g.add_edge("crawl", END)
app = g.compile()

print(app.invoke({"url": "https://scrapegraphai.com"}))

Choosing a pattern

| Pattern | Use when |
| --- | --- |
| Option A β€” ReAct agent | Open-ended tasks; the model decides which endpoint to call |
| Option B β€” StateGraph + ToolNode | You need checkpointing, streaming, human-in-the-loop, or custom routing |
| Option C β€” deterministic pipeline | Steps and order are fixed; no need for LLM decision-making |

Support

Python SDK

Source and issues for scrapegraph-py

Discord

Get help from our community