Web Scraping Guide¶
Comprehensive guide to web scraping with Scrap-E's HTTP and browser-based scrapers.
Overview¶
Scrap-E provides two main approaches to web scraping:
- HttpScraper: Fast, lightweight scraping using HTTP requests
- BrowserScraper: Full browser automation for JavaScript-heavy sites
Choose based on your target website's complexity and requirements.
HTTP Scraping¶
Basic HTTP Scraping¶
The HttpScraper is ideal for static websites and APIs:
import asyncio
from scrap_e.scrapers.web.http_scraper import HttpScraper

async def basic_scraping():
    scraper = HttpScraper()
    result = await scraper.scrape("https://httpbin.org/json")

    if result.success:
        print(f"Status: {result.data.status_code}")
        print(f"Content: {result.data.content}")
        print(f"Headers: {result.data.headers}")

    await scraper._cleanup()

asyncio.run(basic_scraping())
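If you prefer not to call the private _cleanup() method directly, the scraper's session() async context manager (used again in the Concurrent Scraping example below) handles setup and teardown for you. A minimal sketch, assuming the object yielded by session() supports the same scrape() call:

async def basic_scraping_with_session():
    scraper = HttpScraper()
    # The context manager closes the underlying HTTP resources on exit,
    # so no explicit _cleanup() call is needed.
    async with scraper.session() as s:
        result = await s.scrape("https://httpbin.org/json")
        if result.success:
            print(f"Status: {result.data.status_code}")

asyncio.run(basic_scraping_with_session())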
Data Extraction with Rules¶
Extract specific data using CSS selectors, XPath, or regular expressions:
from scrap_e.core.models import ExtractionRule
from scrap_e.scrapers.web.http_scraper import HttpScraper

async def extract_with_rules():
    scraper = HttpScraper()

    # Add extraction rules
    scraper.extraction_rules = [
        ExtractionRule(
            name="title",
            selector="h1.main-title",
            required=True
        ),
        ExtractionRule(
            name="articles",
            selector="article.post",
            multiple=True,
            attribute="data-id"
        ),
        ExtractionRule(
            name="email",
            regex=r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
            multiple=True
        )
    ]

    result = await scraper.scrape("https://example-blog.com")

    if result.success and result.data.extracted_data:
        data = result.data.extracted_data
        print(f"Title: {data.get('title')}")
        print(f"Articles: {len(data.get('articles', []))}")
        print(f"Emails found: {data.get('email', [])}")

    await scraper._cleanup()

asyncio.run(extract_with_rules())
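The example above covers CSS selectors and regular expressions; XPath rules follow the same pattern. A minimal sketch, assuming ExtractionRule also accepts an xpath field alongside selector and regex (verify against the ExtractionRule model in your installed version):

# Hypothetical XPath rule -- the `xpath` field is an assumption, not a
# confirmed part of the ExtractionRule API.
scraper.extraction_rules.append(
    ExtractionRule(
        name="article_links",
        xpath="//article//a/@href",
        multiple=True,
    )
)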
Session Management and Cookies¶
Maintain session state across requests:
async def session_scraping():
    scraper = HttpScraper()

    urls = [
        "https://httpbin.org/cookies/set/session_id/abc123",
        "https://httpbin.org/cookies",
        "https://httpbin.org/headers"
    ]

    # Scrape with session persistence
    results = await scraper.scrape_with_session(
        urls,
        initial_cookies={"user_pref": "dark_mode"}
    )

    for i, result in enumerate(results):
        if result.success:
            print(f"Request {i+1}: {result.data.url}")

asyncio.run(session_scraping())
Custom Headers and Authentication¶
Handle authentication and custom headers:
async def authenticated_scraping():
    scraper = HttpScraper()

    # Custom headers
    headers = {
        "Authorization": "Bearer your-token-here",
        "User-Agent": "MyBot 1.0",
        "Accept": "application/json"
    }

    # Scrape with authentication
    result = await scraper.scrape(
        "https://api.example.com/protected",
        headers=headers,
        method="POST",
        json={"query": "search term"}
    )

    if result.success:
        print(f"Protected data: {result.data.content}")

asyncio.run(authenticated_scraping())
Handling Forms and POST Requests¶
Submit forms and handle POST requests:
async def form_submission():
    scraper = HttpScraper()

    # Login form submission
    login_data = {
        "username": "user@example.com",
        "password": "secure_password",
        "remember_me": "1"
    }

    result = await scraper.scrape(
        "https://example.com/login",
        method="POST",
        data=login_data
    )

    if result.success and result.data.status_code == 200:
        print("Login successful!")

        # Use cookies from login for subsequent requests
        cookies = {}
        set_cookie = result.data.headers.get("set-cookie", "")
        for cookie in set_cookie.split(","):
            if "=" in cookie:
                key, value = cookie.split("=", 1)
                cookies[key.strip()] = value.split(";")[0].strip()

        # Access protected page
        protected_result = await scraper.scrape(
            "https://example.com/dashboard",
            cookies=cookies
        )

asyncio.run(form_submission())
Browser Scraping¶
Basic Browser Automation¶
Use BrowserScraper for JavaScript-heavy sites:
from scrap_e.scrapers.web import BrowserScraper

async def browser_scraping():
    scraper = BrowserScraper()

    result = await scraper.scrape(
        "https://spa-example.com",
        wait_for_selector=".dynamic-content",
        screenshot=True
    )

    if result.success:
        print(f"Title: {result.data.title}")
        print(f"Content loaded: {len(result.data.content)}")

        # Save screenshot
        if result.data.screenshot:
            with open("page_screenshot.png", "wb") as f:
                f.write(result.data.screenshot)

asyncio.run(browser_scraping())
Page Interactions¶
Interact with page elements:
async def interactive_scraping():
    scraper = BrowserScraper()

    interactions = [
        {"action": "fill", "selector": "#search-input", "value": "Python scraping"},
        {"action": "click", "selector": "#search-button"},
        {"action": "wait", "time": 2},
        {"action": "click", "selector": ".result-item:first-child"}
    ]

    result = await scraper.interact_and_scrape(
        "https://search-example.com",
        interactions=interactions
    )

    if result.success:
        print(f"Final page: {result.data.url}")

asyncio.run(interactive_scraping())
Infinite Scroll Pages¶
Handle infinite scroll and dynamic loading:
async def infinite_scroll_scraping():
    scraper = BrowserScraper()

    result = await scraper.scrape_infinite_scroll(
        "https://infinite-scroll-example.com",
        max_scrolls=5,
        wait_between_scrolls=2
    )

    if result.success:
        print(f"Final content length: {len(result.data.content)}")

asyncio.run(infinite_scroll_scraping())
Single Page Applications (SPAs)¶
Navigate through SPA routes:
async def spa_scraping():
    scraper = BrowserScraper()

    routes = ["#/home", "#/products", "#/about", "#/contact"]

    results = await scraper.scrape_spa(
        "https://spa-example.com",
        routes=routes,
        wait_after_navigation=1
    )

    for i, result in enumerate(results):
        print(f"Route {i}: {result.url} - Title: {result.title}")

asyncio.run(spa_scraping())
Advanced Techniques¶
Concurrent Scraping¶
Scrape multiple URLs concurrently:
async def concurrent_scraping():
    scraper = HttpScraper()

    urls = [
        "https://httpbin.org/delay/1",
        "https://httpbin.org/delay/2",
        "https://httpbin.org/delay/1",
        "https://httpbin.org/json",
        "https://httpbin.org/headers"
    ]

    # Concurrent scraping with session management
    async with scraper.session() as s:
        results = await s.scrape_multiple(
            urls,
            max_concurrent=3
        )

    successful = [r for r in results if r.success]
    failed = [r for r in results if not r.success]
    print(f"Successful: {len(successful)}")
    print(f"Failed: {len(failed)}")

asyncio.run(concurrent_scraping())
Pagination Handling¶
Automatically follow pagination:
from scrap_e.core.config import WebScraperConfig
from scrap_e.core.models import PaginationConfig

async def paginated_scraping():
    config = WebScraperConfig(
        pagination=PaginationConfig(
            enabled=True,
            max_pages=5,
            next_page_selector="a.next-page",
            stop_condition="No more results"
        )
    )

    scraper = HttpScraper(config)

    # Extract data from each page
    scraper.add_extraction_rule(ExtractionRule(
        name="items",
        selector=".item",
        multiple=True
    ))

    results = await scraper.scrape_paginated("https://example.com/page1")

    all_items = []
    for result in results:
        if result.success and result.data.extracted_data:
            all_items.extend(result.data.extracted_data.get("items", []))

    print(f"Total items collected: {len(all_items)}")

asyncio.run(paginated_scraping())
Sitemap Processing¶
Extract and scrape URLs from sitemaps:
async def sitemap_scraping():
    scraper = HttpScraper()

    # Extract URLs from sitemap
    urls = await scraper.scrape_sitemap("https://example.com/sitemap.xml")
    print(f"Found {len(urls)} URLs in sitemap")

    # Scrape first 10 URLs
    sample_urls = urls[:10]
    results = await scraper.scrape_multiple(sample_urls)

    for result in results:
        if result.success:
            print(f"✓ {result.data.url}")
        else:
            print(f"✗ {result.error}")

asyncio.run(sitemap_scraping())
Stream Scraping¶
Process large responses in chunks:
async def stream_scraping():
    scraper = HttpScraper()

    async for chunk in scraper.scrape_stream(
        "https://large-file-example.com/data.html",
        chunk_size=1024
    ):
        # Process each chunk as it arrives
        print(f"Processed chunk: {len(chunk.content)} bytes")

asyncio.run(stream_scraping())
Error Handling and Resilience¶
Retry Logic and Error Recovery¶
from scrap_e.core.config import WebScraperConfig
from scrap_e.core.models import RetryConfig
from scrap_e.core.exceptions import ScraperError, ConnectionError

async def resilient_scraping():
    config = WebScraperConfig(
        retry=RetryConfig(
            enabled=True,
            max_attempts=3,
            initial_delay=1.0,
            exponential_base=2.0
        )
    )

    scraper = HttpScraper(config)

    try:
        result = await scraper.scrape("https://unreliable-site.com")
        if result.success:
            print("Scraping successful after retries")
        else:
            print(f"Scraping failed: {result.error}")
    except ConnectionError as e:
        print(f"Connection error: {e}")
    except ScraperError as e:
        print(f"Scraper error: {e}")

asyncio.run(resilient_scraping())
Rate Limiting and Politeness¶
from scrap_e.core.models import RateLimitConfig

async def polite_scraping():
    config = WebScraperConfig(
        rate_limit=RateLimitConfig(
            enabled=True,
            requests_per_second=2.0,
            burst_size=5
        )
    )

    scraper = HttpScraper(config)

    urls = [f"https://example.com/page{i}" for i in range(10)]

    # Rate limiting is applied automatically
    results = await scraper.scrape_multiple(urls)
    print(f"Scraped {len(results)} URLs with rate limiting")

asyncio.run(polite_scraping())
Performance Optimization¶
Caching¶
Enable response caching:
from scrap_e.core.models import CacheConfig

async def cached_scraping():
    config = WebScraperConfig(
        cache=CacheConfig(
            enabled=True,
            backend="memory",
            ttl_seconds=300,
            max_size_mb=100
        )
    )

    scraper = HttpScraper(config)

    # First request - fetched from server
    result1 = await scraper.scrape("https://example.com")
    print(f"First request: {result1.metadata.duration_seconds:.2f}s")

    # Second request - served from cache (much faster)
    result2 = await scraper.scrape("https://example.com")
    print(f"Cached request: {result2.metadata.duration_seconds:.2f}s")

asyncio.run(cached_scraping())
Connection Pooling¶
Optimize for high-volume scraping:
import time

async def high_volume_scraping():
    config = WebScraperConfig(
        concurrent_requests=20,
        default_timeout=15.0,
        max_workers=10
    )

    scraper = HttpScraper(config)

    # Generate many URLs
    urls = [f"https://httpbin.org/delay/{i % 3}" for i in range(100)]

    start_time = time.time()
    results = await scraper.scrape_multiple(urls)
    duration = time.time() - start_time

    successful = sum(1 for r in results if r.success)
    print(f"Scraped {successful}/{len(urls)} URLs in {duration:.2f}s")
    print(f"Rate: {successful/duration:.1f} URLs/second")

asyncio.run(high_volume_scraping())
Best Practices¶
1. Choose the Right Scraper¶
- HttpScraper: Static sites, APIs, fast scraping
- BrowserScraper: SPAs, JavaScript-heavy sites, complex interactions
2. Respect Robots.txt¶
async def check_robots():
    scraper = HttpScraper()

    # Check robots.txt before scraping
    robots_result = await scraper.scrape("https://example.com/robots.txt")

    if robots_result.success:
        robots_content = robots_result.data.content
        if "Disallow: /" in robots_content:
            print("Site disallows scraping")
            return

    # Proceed with scraping
    result = await scraper.scrape("https://example.com")

asyncio.run(check_robots())
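The substring check above is intentionally simplistic. For a more faithful reading of robots.txt rules, Python's standard-library urllib.robotparser can evaluate the fetched content against a specific user agent and URL, for example:

from urllib.robotparser import RobotFileParser

def allowed_by_robots(robots_content: str, user_agent: str, url: str) -> bool:
    # Parse the robots.txt text already fetched by the scraper and ask
    # whether this user agent may fetch the given URL.
    parser = RobotFileParser()
    parser.parse(robots_content.splitlines())
    return parser.can_fetch(user_agent, url)

# Example: allowed_by_robots(robots_content, "MyBot 1.0", "https://example.com/")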
3. Use Appropriate User Agents¶
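A descriptive user agent that identifies your bot (ideally with a contact address) is more polite than impersonating a browser. One way to set it is the headers parameter already shown in the authentication example; the contact URL below is a placeholder:

async def identified_scraping():
    scraper = HttpScraper()

    # Identify the bot clearly so site operators can reach you.
    headers = {"User-Agent": "MyBot 1.0 (+https://example.com/bot-info)"}

    result = await scraper.scrape("https://example.com", headers=headers)
    if result.success:
        print(f"Fetched with custom user agent: {result.data.status_code}")

asyncio.run(identified_scraping())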
4. Handle Errors Gracefully¶
async def safe_scraping():
    scraper = HttpScraper()

    urls = ["https://example.com", "https://invalid-url.xyz"]

    for url in urls:
        try:
            result = await scraper.scrape(url)
            if result.success:
                print(f"✓ {url}")
            else:
                print(f"✗ {url}: {result.error}")
        except Exception as e:
            print(f"✗ {url}: Unexpected error: {e}")

asyncio.run(safe_scraping())
5. Monitor Performance¶
async def monitored_scraping():
    scraper = HttpScraper()

    # Scrape some URLs
    urls = [f"https://httpbin.org/delay/{i}" for i in range(5)]
    await scraper.scrape_multiple(urls)

    # Check statistics
    stats = scraper.get_stats()
    print(f"Total requests: {stats.total_requests}")
    print(f"Successful: {stats.successful_requests}")
    print(f"Failed: {stats.failed_requests}")
    print(f"Average response time: {stats.average_response_time:.2f}s")

asyncio.run(monitored_scraping())
Common Patterns¶
News Article Scraping¶
async def scrape_news():
    scraper = HttpScraper()

    scraper.add_extraction_rule(ExtractionRule(
        name="headline",
        selector="h1.headline, h1.title",
        required=True
    ))
    scraper.add_extraction_rule(ExtractionRule(
        name="author",
        selector=".author, .byline",
    ))
    scraper.add_extraction_rule(ExtractionRule(
        name="publish_date",
        selector="time[datetime], .publish-date",
        attribute="datetime"
    ))
    scraper.add_extraction_rule(ExtractionRule(
        name="content",
        selector=".article-body, .content",
        multiple=True
    ))

    result = await scraper.scrape("https://news-site.com/article")

    if result.success:
        article = result.data.extracted_data
        print(f"Title: {article['headline']}")
        print(f"Author: {article['author']}")
        print(f"Date: {article['publish_date']}")
        print(f"Paragraphs: {len(article['content'])}")

asyncio.run(scrape_news())
E-commerce Product Data¶
async def scrape_products():
    scraper = BrowserScraper()  # E-commerce sites often rely on JavaScript

    scraper.add_extraction_rule(ExtractionRule(
        name="name",
        selector="h1.product-title",
        required=True
    ))
    scraper.add_extraction_rule(ExtractionRule(
        name="price",
        selector=".price .current",
        transform="float"
    ))
    scraper.add_extraction_rule(ExtractionRule(
        name="rating",
        selector=".rating",
        attribute="data-rating"
    ))
    scraper.add_extraction_rule(ExtractionRule(
        name="reviews",
        selector=".review",
        multiple=True
    ))
    scraper.add_extraction_rule(ExtractionRule(
        name="images",
        selector=".product-image img",
        attribute="src",
        multiple=True
    ))

    result = await scraper.scrape("https://shop.example.com/product/123")

    if result.success:
        product = result.data.extracted_data
        print(f"Product: {product['name']}")
        print(f"Price: ${product['price']}")
        print(f"Rating: {product['rating']}/5")
        print(f"Reviews: {len(product['reviews'])}")
        print(f"Images: {len(product['images'])}")

asyncio.run(scrape_products())
Troubleshooting¶
Common Issues¶
- JavaScript Not Loading: Use BrowserScraper instead of HttpScraper
- Rate Limiting: Reduce requests_per_second in the configuration
- Timeouts: Increase default_timeout for slow sites
- Memory Usage: Enable streaming for large responses
- Blocked Requests: Use different user agents and proxies (see the sketch below)
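For blocked requests, rotating the User-Agent header between requests is a simple first step. The sketch below reuses the per-request headers parameter shown earlier; proxy support, if your configuration exposes it, would slot in the same way. The user-agent strings are placeholders:

import random

UA_POOL = [
    "MyBot 1.0 (+https://example.com/bot-info)",
    "MyBot 1.0 (contact: admin@example.com)",
]

async def rotating_user_agents(urls):
    scraper = HttpScraper()
    results = []
    for url in urls:
        # Pick a different User-Agent string for each request.
        headers = {"User-Agent": random.choice(UA_POOL)}
        results.append(await scraper.scrape(url, headers=headers))
    return results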
Debug Mode¶
from scrap_e.core.config import WebScraperConfig

config = WebScraperConfig(
    debug=True,
    log_level="DEBUG",
    browser_headless=False,  # See browser in action
    browser_screenshot_on_error=True
)

scraper = BrowserScraper(config)
# Detailed logs and error screenshots will be generated
Next Steps¶
- Explore API scraping for REST and GraphQL APIs
- Learn about database scraping for direct data access
- Check out file processing for local file extraction
- Review performance optimization techniques