from langchain import LLMChain
from scrapegraph_py import Client
from pydantic import BaseModel, Field
from typing import Optional
class ArticleSchema(BaseModel):
    """Schema for the structured fields extracted from an article page.

    Passed as ``output_schema`` to scrapegraph's smartscraper so the
    extraction result is validated/shaped to these fields.
    """
    title: str = Field(description="Article title")
    content: str = Field(description="Main article content")
    # Annotating Optional[...] alone does NOT make a field optional in
    # pydantic v2 — Field(...) without a default leaves it required.
    # Explicit default=None makes author/date/summary truly optional.
    author: Optional[str] = Field(default=None, description="Article author name")
    date: Optional[str] = Field(default=None, description="Publication date")
    summary: Optional[str] = Field(default=None, description="Article summary or description")
# Initialize the client (reads the API key from the environment by default).
client = Client()
try:
    # Scrape relevant content, shaped by ArticleSchema.
    response = client.smartscraper(
        website_url="https://example.com/article",
        user_prompt="Extract the main article content, title, author, and publication date",
        output_schema=ArticleSchema
    )
    # Use in your RAG pipeline.
    # NOTE(review): depending on the scrapegraph-py version, smartscraper may
    # return a dict payload rather than an object — confirm that attribute
    # access (response.title / response.content) is valid here.
    text_content = f"Title: {response.title}\n\nContent: {response.content}"
    # split_text() yields plain strings, so index them with add_texts();
    # add_documents() expects LangChain Document objects, not str.
    # NOTE(review): text_splitter / vectorstore / llm_chain are assumed to be
    # constructed elsewhere — they are not defined in this file.
    docs = text_splitter.split_text(text_content)
    vectorstore.add_texts(docs)
    # Query your LLM with the enhanced context.
    response = llm_chain.run("Summarize the latest developments...")
except Exception as e:
    # Broad catch is acceptable at this top-level script boundary, but report
    # the failure instead of swallowing it silently.
    print(f"Error occurred: {str(e)}")
finally:
    # Release the HTTP session held by the scrapegraph client.
    client.close()