from pydantic import BaseModel, Field
from typing import List, Optional
from datetime import datetime, timedelta
from scrapegraph_py import Client
import time # For job polling
# Schema for blog post data
class BlogPost(BaseModel):
    """Schema for the data describing a single blog post.

    Fields annotated ``Optional`` default to ``None`` so that a post which
    lacks them (no excerpt, no tags, ...) still validates; without an explicit
    default, Pydantic v2 treats an ``Optional`` field as required.
    """

    title: str = Field(description="Blog post title")
    content: str = Field(description="Blog post content")
    excerpt: Optional[str] = Field(default=None, description="Post excerpt or summary")
    author: str = Field(description="Post author")
    publication_date: str = Field(description="Publication date")
    url: str = Field(description="Post URL")
    categories: Optional[List[str]] = Field(default=None, description="Post categories")
    tags: Optional[List[str]] = Field(default=None, description="Post tags")
    comments_count: Optional[int] = Field(default=None, description="Number of comments")
    reading_time: Optional[int] = Field(
        default=None, description="Estimated reading time in minutes"
    )
# Schema for blog monitoring results
class BlogMonitorResult(BaseModel):
    """Schema for the result of one blog monitoring run.

    Every field is required; the ``...`` sentinel makes that explicit.
    """

    posts: List[BlogPost] = Field(..., description="List of blog posts")
    total_posts: int = Field(..., description="Total number of posts found")
    blog_url: str = Field(..., description="Blog homepage URL")
    last_updated: str = Field(..., description="Last monitoring timestamp")
# Create the API client with the account key.
client = Client(api_key="your-api-key")

# Start the crawler job. The options are collected in one mapping and then
# unpacked as keyword arguments: crawl up to 2 levels deep, restricted by the
# include/exclude URL patterns.
crawl_options = {
    "url": "https://example-blog.com",
    "depth": 2,
    "include_patterns": ["/blog/*"],
    "exclude_patterns": ["/tag/*", "/author/*"],
}
job = client.crawl.start(**crawl_options)
# Wait for job completion and get results.
job_id = job["id"]

# Define the result up front: when the job ends in a failure state the loop
# exits without assigning it, and the processing step would otherwise raise
# NameError instead of simply finding nothing to process.
response = None

# NOTE(review): the loop has no upper bound; a job that never reaches one of
# the terminal states below is polled forever -- consider adding a deadline.
while True:
    status = client.crawl.status(job_id)
    state = status.get("status")
    if state == "completed":
        response = status.get("data", {})
        break
    if state in ("failed", "cancelled", "error"):
        print(f"Job failed: {status.get('error')}")
        break
    time.sleep(5)  # Wait 5 seconds before checking again
# Process the crawled content if successful: print a short report for every
# page published within the last 7 days.
if response and response.get("pages"):
    print(f"Total Pages Found: {len(response['pages'])}")

    # Compute the cutoff once so every page is compared against the same instant.
    recent_cutoff = datetime.now() - timedelta(days=7)

    for post in response["pages"]:
        # Skip pages whose publication date is missing or not in YYYY-MM-DD
        # form rather than aborting the whole report on the first bad page.
        try:
            post_date = datetime.strptime(post["publication_date"], "%Y-%m-%d")
        except (KeyError, TypeError, ValueError):
            continue

        # Only report recent posts.
        if post_date <= recent_cutoff:
            continue

        print(f"Title: {post['title']}")
        print(f"Author: {post.get('author', 'Unknown')}")
        print(f"Published: {post['publication_date']}")
        print(f"Reading Time: {post.get('reading_time', 'N/A')} minutes")
        if post.get("categories"):
            print(f"Categories: {', '.join(post['categories'])}")
        if post.get("tags"):
            print(f"Tags: {', '.join(post['tags'])}")
        if post.get("excerpt"):
            print(f"\nExcerpt: {post['excerpt']}")
        print(f"URL: {post['url']}\n")