Overview
LangGraph runs on top of LangChain, so vanilla @tool-decorated wrappers around the scrapegraph-py SDK plug straight into create_react_agent, ToolNode, or any custom StateGraph node — no third-party integration package needed.
LangGraph docs Official LangGraph documentation
scrapegraph-py on PyPI The official Python SDK for ScrapeGraph v2
Installation
pip install langchain langchain-openai langgraph scrapegraph-py
Set your keys:
export SGAI_API_KEY="your-scrapegraph-key"
export OPENAI_API_KEY="your-openai-key"
Get your ScrapeGraph API key from the dashboard.
Save as sgai_tools.py — every example below imports from it.
from typing import Optional

from langchain_core.tools import tool
from scrapegraph_py import ScrapeGraphAI, MarkdownFormatConfig, JsonFormatConfig

# One shared client for every tool below; reads SGAI_API_KEY from the environment.
sgai = ScrapeGraphAI()
def _unwrap(result):
    """Return the SDK response payload as a plain dict.

    Args:
        result: a scrapegraph-py response object exposing `.error` and `.data`.

    Returns:
        `result.data`, converted via `model_dump()` when it is a pydantic-style
        model, otherwise passed through unchanged.

    Raises:
        RuntimeError: if the SDK reported an error for this request, so agent
            loops see failures instead of silently consuming them.
    """
    if result.error:
        raise RuntimeError(f"ScrapeGraph error: {result.error}")
    data = result.data
    # Pydantic models expose model_dump(); plain dicts pass through untouched.
    return data.model_dump() if hasattr(data, "model_dump") else data
# Core single-shot endpoints: page fetch, prompted extraction, AI search.
# Docstrings double as the tool descriptions the LLM sees, so they are kept
# exactly as shipped.

@tool
def scrape(url: str) -> dict:
    """Fetch a web page and return its content as markdown."""
    return _unwrap(sgai.scrape(url=url, formats=[MarkdownFormatConfig()]))


@tool
def extract(url: str, prompt: str) -> dict:
    """Extract structured data from a web page using a natural-language prompt."""
    return _unwrap(sgai.extract(prompt=prompt, url=url))


@tool
def search(query: str, num_results: int = 3) -> dict:
    """Run an AI web search; returns ranked results with fetched content."""
    return _unwrap(sgai.search(query=query, num_results=num_results))
# Crawl lifecycle: crawls are asynchronous jobs, so the API is start / get /
# stop / resume keyed by the job id returned from crawl_start.

@tool
def crawl_start(url: str, max_depth: int = 2, max_pages: int = 10) -> dict:
    """Start a multi-page crawl job. Returns a dict including the crawl `id`."""
    return _unwrap(sgai.crawl.start(
        url=url,
        max_depth=max_depth,
        max_pages=max_pages,
        formats=[MarkdownFormatConfig()],
    ))


@tool
def crawl_get(crawl_id: str) -> dict:
    """Fetch the status and result of a crawl job."""
    return _unwrap(sgai.crawl.get(crawl_id))


@tool
def crawl_stop(crawl_id: str) -> dict:
    """Stop a running crawl."""
    return _unwrap(sgai.crawl.stop(crawl_id))


@tool
def crawl_resume(crawl_id: str) -> dict:
    """Resume a stopped crawl."""
    return _unwrap(sgai.crawl.resume(crawl_id))
# Monitor lifecycle: scheduled re-scrapes keyed by monitor id.

@tool
def monitor_create(url: str, interval: str, name: Optional[str] = None, prompt: Optional[str] = None) -> dict:
    """Create a scheduled monitor. If `prompt` is given, each tick stores
    JSON extraction; otherwise it stores markdown. `interval` is cron syntax."""
    # Prompted monitors store JSON extraction; unprompted ones store markdown.
    formats = [JsonFormatConfig(prompt=prompt)] if prompt else [MarkdownFormatConfig()]
    return _unwrap(sgai.monitor.create(url=url, interval=interval, name=name, formats=formats))


@tool
def monitor_list() -> list:
    """List all monitors."""
    return _unwrap(sgai.monitor.list())


@tool
def monitor_get(monitor_id: str) -> dict:
    """Get one monitor by id."""
    return _unwrap(sgai.monitor.get(monitor_id))


@tool
def monitor_pause(monitor_id: str) -> dict:
    """Pause a monitor."""
    return _unwrap(sgai.monitor.pause(monitor_id))


@tool
def monitor_resume(monitor_id: str) -> dict:
    """Resume a paused monitor."""
    return _unwrap(sgai.monitor.resume(monitor_id))


@tool
def monitor_delete(monitor_id: str) -> dict:
    """Delete a monitor."""
    # Delete returns no useful payload, so report which id was removed.
    _unwrap(sgai.monitor.delete(monitor_id))
    return {"deleted": monitor_id}
# Account-level introspection: request history and remaining credits.

@tool
def history_list(service: Optional[str] = None, page: int = 1, limit: int = 20) -> dict:
    """List recent API request history."""
    return _unwrap(sgai.history.list(service=service, page=page, limit=limit))


@tool
def credits() -> dict:
    """Check remaining ScrapeGraph API credits."""
    return _unwrap(sgai.credits())
# Every tool above, ready to hand to an agent in a single list.
ALL_TOOLS = [
    scrape,
    extract,
    search,
    crawl_start,
    crawl_get,
    crawl_stop,
    crawl_resume,
    monitor_create,
    monitor_list,
    monitor_get,
    monitor_pause,
    monitor_resume,
    monitor_delete,
    history_list,
    credits,
]
| ScrapeGraph endpoint | SDK call | Tool |
| --- | --- | --- |
| POST /scrape | sgai.scrape(url=...) | scrape |
| POST /extract | sgai.extract(prompt=..., url=...) | extract |
| POST /search | sgai.search(query=...) | search |
| POST /crawl | sgai.crawl.start(url=...) | crawl_start |
| GET /crawl/{id} | sgai.crawl.get(id) | crawl_get |
| POST /crawl/{id}/stop | sgai.crawl.stop(id) | crawl_stop |
| POST /crawl/{id}/resume | sgai.crawl.resume(id) | crawl_resume |
| POST /monitor | sgai.monitor.create(url=..., interval=...) | monitor_create |
| GET /monitor | sgai.monitor.list() | monitor_list |
| GET /monitor/{id} | sgai.monitor.get(id) | monitor_get |
| POST /monitor/{id}/pause | sgai.monitor.pause(id) | monitor_pause |
| POST /monitor/{id}/resume | sgai.monitor.resume(id) | monitor_resume |
| DELETE /monitor/{id} | sgai.monitor.delete(id) | monitor_delete |
| GET /history | sgai.history.list(...) | history_list |
| GET /credits | sgai.credits() | credits |
Option A — prebuilt agent via create_agent
Fastest path: LangChain v1's create_agent returns a compiled LangGraph with the standard ReAct loop baked in — one call wires up every tool behind an LLM router.
from langchain.agents import create_agent
from langchain_openai import ChatOpenAI

from sgai_tools import ALL_TOOLS

# Deterministic routing decisions: temperature 0.
llm = ChatOpenAI(model="gpt-4o", temperature=0)

# create_agent compiles a ready-to-run ReAct loop over all fifteen tools.
agent = create_agent(
    model=llm,
    tools=ALL_TOOLS,
    system_prompt="You are a web research agent. Use ScrapeGraph tools to gather and extract web data.",
)

final_state = agent.invoke({
    "messages": [("user", "Search for 'best AI scraping tools 2026' and extract the top 3 names into JSON.")],
})
print(final_state["messages"][-1].content)
langgraph.prebuilt.create_react_agent still exists but is deprecated in LangGraph v1.0 — use create_agent from langchain.agents.
Option B — custom StateGraph + ToolNode
Use this when you need custom routing, streaming, interrupts, or checkpointing.
from typing import Annotated, TypedDict

from langchain_core.messages import BaseMessage
from langchain_openai import ChatOpenAI
from langgraph.graph import StateGraph, START
from langgraph.graph.message import add_messages
from langgraph.prebuilt import ToolNode, tools_condition

from sgai_tools import scrape, extract, search, crawl_start, crawl_get, credits


class State(TypedDict):
    # add_messages merges each node's returned messages into the history.
    messages: Annotated[list[BaseMessage], add_messages]


tools = [scrape, extract, search, crawl_start, crawl_get, credits]
llm = ChatOpenAI(model="gpt-4o", temperature=0).bind_tools(tools)


def call_model(state: State):
    """One LLM turn: append the model's reply (which may request tool calls)."""
    reply = llm.invoke(state["messages"])
    return {"messages": [reply]}


graph = StateGraph(State)
graph.add_node("agent", call_model)
graph.add_node("tools", ToolNode(tools))
graph.add_edge(START, "agent")
# tools_condition routes to "tools" when the last message has tool calls,
# otherwise to END.
graph.add_conditional_edges("agent", tools_condition)
graph.add_edge("tools", "agent")
app = graph.compile()

out = app.invoke({
    "messages": [("user", "Extract the top stories from https://news.ycombinator.com")],
})
print(out["messages"][-1].content)
Option C — deterministic pipeline
When the sequence is known in advance — e.g. search → pick URL → extract — skip the agent loop and call tools directly from nodes. No LLM routing, fully reproducible.
from typing import TypedDict

from langgraph.graph import StateGraph, START, END

from sgai_tools import search, extract


class State(TypedDict):
    query: str    # what to search for
    top_url: str  # best hit chosen by the search step
    data: dict    # structured data pulled from that URL


def do_search(state: State):
    """Search the web and keep only the top result's URL."""
    hits = search.invoke({"query": state["query"], "num_results": 1})
    return {"top_url": hits["results"][0]["url"]}


def do_extract(state: State):
    """Extract name and price from the URL chosen by do_search."""
    payload = extract.invoke({
        "url": state["top_url"],
        "prompt": "Extract the product name and price as JSON",
    })
    return {"data": payload}


g = StateGraph(State)
g.add_node("search", do_search)
g.add_node("extract", do_extract)
g.add_edge(START, "search")
g.add_edge("search", "extract")
g.add_edge("extract", END)
pipeline = g.compile()

print(pipeline.invoke({"query": "iPhone 15 Pro price apple.com"}))
Crawl as a background node
Crawls are async. Wrap start + poll in a single node so the graph advances only when the job completes.
import time
from typing import TypedDict

from langgraph.graph import StateGraph, START, END

from sgai_tools import crawl_start, crawl_get

# Stop polling after this many seconds so a stuck crawl job cannot hang the
# graph forever (the original loop had no exit besides completed/failed).
CRAWL_TIMEOUT_S = 600
POLL_INTERVAL_S = 5


class CrawlState(TypedDict):
    url: str       # page to start crawling from
    crawl_id: str  # filled in by run_crawl
    result: dict   # final job info (status + crawled content)


def run_crawl(state: CrawlState):
    """Start a crawl and block until it finishes, fails, or times out.

    Returns a partial state update with the crawl id and the final job info.

    Raises:
        TimeoutError: if the job is still running after CRAWL_TIMEOUT_S seconds.
    """
    job = crawl_start.invoke({"url": state["url"], "max_depth": 2, "max_pages": 10})
    crawl_id = job["id"]
    # monotonic() is immune to wall-clock adjustments during the wait.
    deadline = time.monotonic() + CRAWL_TIMEOUT_S
    while True:
        info = crawl_get.invoke({"crawl_id": crawl_id})
        if info["status"] in ("completed", "failed"):
            return {"crawl_id": crawl_id, "result": info}
        if time.monotonic() >= deadline:
            raise TimeoutError(f"Crawl {crawl_id} did not finish within {CRAWL_TIMEOUT_S}s")
        time.sleep(POLL_INTERVAL_S)


g = StateGraph(CrawlState)
g.add_node("crawl", run_crawl)
g.add_edge(START, "crawl")
g.add_edge("crawl", END)
app = g.compile()

print(app.invoke({"url": "https://scrapegraphai.com"}))
Choosing a pattern
| Pattern | Use when |
| --- | --- |
| Option A — ReAct agent | Open-ended tasks; the model decides which endpoint to call |
| Option B — StateGraph + ToolNode | You need checkpointing, streaming, human-in-the-loop, or custom routing |
| Option C — deterministic pipeline | Steps and order are fixed; no need for LLM decision-making |
Support
Python SDK Source and issues for scrapegraph-py
Discord Get help from our community