HTML Parser API

The HTML Parser provides advanced parsing with support for multiple backends, enabling efficient data extraction from HTML documents.

Overview

The HtmlParser class offers a unified interface for HTML parsing using different backends:

  • BeautifulSoup - Python-native, feature-rich
  • lxml - C-based, high performance
  • selectolax - Fastest CSS selector engine (optional)

Class Reference

HtmlParser(html_content, parser_type='lxml')

Advanced HTML parser with multiple backend support.

Source code in src/scrap_e/scrapers/web/parser.py
def __init__(self, html_content: str, parser_type: str = "lxml") -> None:
    self.html_content = html_content
    self.parser_type = parser_type
    self._soup: BeautifulSoup | None = None
    self._lxml_tree: Any | None = None
    self._selectolax_tree: Any | None = None

Attributes

lxml_tree property

Get lxml parser instance.

selectolax_tree property

Get selectolax parser instance.

soup property

Get BeautifulSoup parser instance.

Functions

clean_text(text)

Clean and normalize text.

Source code in src/scrap_e/scrapers/web/parser.py
def clean_text(self, text: str | None) -> str:
    """Clean and normalize text."""
    if text is None:
        return ""

    # Replace multiple whitespaces with single space
    text = re.sub(r"\s+", " ", text)
    # Strip leading/trailing whitespace
    return text.strip()
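
A minimal usage sketch based on the source above (the messy input string is illustrative):

from scrap_e.scrapers.web.parser import HtmlParser

parser = HtmlParser("<p>example</p>")

print(parser.clean_text("  Hello \n\t  world  "))  # "Hello world"
print(parser.clean_text(None))                     # ""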

extract_all_tables()

Extract all tables from HTML.

Source code in src/scrap_e/scrapers/web/parser.py
def extract_all_tables(self) -> list[dict[str, Any]]:
    """Extract all tables from HTML."""
    return [
        self._parse_table(table)
        for table in self.soup.find_all("table")
        if isinstance(table, Tag)
    ]
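
A sketch of calling it on a single-table document. Each entry is produced by the internal _parse_table helper (not shown here), so the exact keys of the per-table dict depend on that implementation:

from scrap_e.scrapers.web.parser import HtmlParser

html = "<table><tr><th>Name</th></tr><tr><td>Ada</td></tr></table>"
parser = HtmlParser(html)

tables = parser.extract_all_tables()
print(len(tables))  # 1 - one dict per <table> element in the document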

extract_forms()

Extract form data from HTML.

Source code in src/scrap_e/scrapers/web/parser.py
def extract_forms(self) -> list[dict[str, Any]]:
    """Extract form data from HTML."""
    forms = []

    for form in self.soup.find_all("form"):
        if not isinstance(form, Tag):
            continue
        form_data: dict[str, Any] = {
            "action": str(form.get("action", "")),
            "method": str(form.get("method", "get")).lower(),
            "id": form.get("id"),
            "inputs": [],
        }

        # Extract all input fields
        for input_elem in form.find_all(["input", "select", "textarea"]):
            if not isinstance(input_elem, Tag):
                continue
            input_data: dict[str, Any] = {
                "type": (
                    str(input_elem.get("type", "text"))
                    if input_elem.name == "input"
                    else input_elem.name
                ),
                "name": input_elem.get("name"),
                "id": input_elem.get("id"),
                "value": input_elem.get("value"),
                "placeholder": input_elem.get("placeholder"),
                "required": input_elem.get("required") is not None,
            }

            # For select elements, get options
            if input_elem.name == "select":
                options = []
                for option in input_elem.find_all("option"):
                    if isinstance(option, Tag):
                        options.append(
                            {
                                "value": option.get("value", option.text),
                                "text": option.text,
                            }
                        )
                input_data["options"] = options

            form_data["inputs"].append(input_data)

        forms.append(form_data)

    return forms
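
A sketch against an illustrative login form, with expected values based on the source above:

from scrap_e.scrapers.web.parser import HtmlParser

html = """
<form action="/login" method="POST" id="login-form">
    <input type="email" name="email" required>
    <input type="password" name="password">
    <select name="remember">
        <option value="yes">Yes</option>
        <option value="no">No</option>
    </select>
</form>
"""

parser = HtmlParser(html)
forms = parser.extract_forms()

print(forms[0]["action"])                 # /login
print(forms[0]["method"])                 # post
print(forms[0]["inputs"][0]["required"])  # True
print(forms[0]["inputs"][2]["options"])
# [{'value': 'yes', 'text': 'Yes'}, {'value': 'no', 'text': 'No'}]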

extract_images(absolute_url=None)

Extract all images from HTML.

Source code in src/scrap_e/scrapers/web/parser.py
def extract_images(self, absolute_url: str | None = None) -> list[dict[str, str]]:
    """Extract all images from HTML."""
    images = []
    for img in self.soup.find_all("img"):
        if not isinstance(img, Tag):
            continue
        src_attr = img.get("src", "")
        src = src_attr if isinstance(src_attr, str) else ""
        if absolute_url and src:
            src = urljoin(absolute_url, src)

        images.append(
            {
                "src": src,
                "alt": str(img.get("alt", "")),
                "title": str(img.get("title", "")),
                "width": str(img.get("width", "")),
                "height": str(img.get("height", "")),
            }
        )

    return images
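
A sketch showing how the optional absolute_url parameter affects the result (the image markup is illustrative):

from scrap_e.scrapers.web.parser import HtmlParser

parser = HtmlParser('<img src="/img/logo.png" alt="Logo" width="120">')

# Without a base URL the src attribute is returned as-is
print(parser.extract_images()[0]["src"])
# /img/logo.png

# With absolute_url, relative paths are resolved via urljoin
print(parser.extract_images("https://example.com")[0]["src"])
# https://example.com/img/logo.png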

extract_links(absolute_url=None)

Extract all links from HTML.

Source code in src/scrap_e/scrapers/web/parser.py
def extract_links(self, absolute_url: str | None = None) -> list[dict[str, str]]:
    """Extract all links from HTML."""
    links = []
    for link in self.soup.find_all("a", href=True):
        if not isinstance(link, Tag):
            continue
        href = link.get("href")
        if not href:
            continue
        if absolute_url and isinstance(href, str):
            href = urljoin(absolute_url, href)

        if isinstance(href, str):
            links.append(
                {
                    "url": href,
                    "text": (link.get_text(strip=True) if isinstance(link, Tag) else ""),
                    "title": (str(link.get("title", "")) if isinstance(link, Tag) else ""),
                }
            )

    return links
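
A sketch with an illustrative anchor tag, resolving the relative href against a base URL:

from scrap_e.scrapers.web.parser import HtmlParser

parser = HtmlParser('<a href="/docs" title="Docs">Read the docs</a>')

links = parser.extract_links(absolute_url="https://example.com")
print(links[0])
# {'url': 'https://example.com/docs', 'text': 'Read the docs', 'title': 'Docs'}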

extract_metadata()

Extract common metadata from HTML.

Source code in src/scrap_e/scrapers/web/parser.py
def extract_metadata(self) -> dict[str, Any]:
    """Extract common metadata from HTML."""
    metadata: dict[str, Any] = {
        "title": None,
        "description": None,
        "keywords": None,
        "author": None,
        "language": None,
        "canonical_url": None,
        "og_data": {},
        "twitter_data": {},
        "schema_data": [],
    }

    # Title
    title_tag = self.soup.find("title")
    if title_tag:
        metadata["title"] = title_tag.get_text(strip=True)

    # Meta tags
    for meta in self.soup.find_all("meta"):
        if not isinstance(meta, Tag):
            continue
        name_attr = meta.get("name", "")
        name = name_attr.lower() if isinstance(name_attr, str) else ""
        prop_attr = meta.get("property", "")
        property = prop_attr.lower() if isinstance(prop_attr, str) else ""
        content = meta.get("content", "")

        if name == "description":
            metadata["description"] = content
        elif name == "keywords":
            metadata["keywords"] = content
        elif name == "author":
            metadata["author"] = content
        elif property.startswith("og:"):
            metadata["og_data"][property] = content
            # Also add at top level for backward compatibility
            metadata[property] = content
        elif name.startswith("twitter:"):
            metadata["twitter_data"][name] = content

    # Language
    html_tag = self.soup.find("html")
    if html_tag and isinstance(html_tag, Tag):
        metadata["language"] = html_tag.get("lang")

    # Canonical URL
    canonical = self.soup.find("link", rel="canonical")
    if canonical and isinstance(canonical, Tag):
        metadata["canonical_url"] = canonical.get("href")

    # Schema.org data
    json_scripts = self.soup.find_all("script", type="application/ld+json")
    for script in json_scripts:
        try:
            if isinstance(script, Tag) and script.string:
                data = json.loads(script.string)
            else:
                continue
            metadata["schema_data"].append(data)
        except json.JSONDecodeError:
            continue

    return metadata
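
A sketch against an illustrative document head, with expected values based on the source above:

from scrap_e.scrapers.web.parser import HtmlParser

html = """
<html lang="en">
<head>
    <title>Example</title>
    <meta name="description" content="A demo page">
    <meta property="og:title" content="Example OG Title">
    <link rel="canonical" href="https://example.com/page">
</head>
<body></body>
</html>
"""

parser = HtmlParser(html)
meta = parser.extract_metadata()

print(meta["title"])          # Example
print(meta["description"])    # A demo page
print(meta["language"])       # en
print(meta["canonical_url"])  # https://example.com/page
print(meta["og_data"])        # {'og:title': 'Example OG Title'}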

extract_structured_data()

Extract all structured data from the page.

Source code in src/scrap_e/scrapers/web/parser.py
def extract_structured_data(self) -> dict[str, Any]:
    """Extract all structured data from the page."""
    return {
        "metadata": self.extract_metadata(),
        "links": self.extract_links(),
        "images": self.extract_images(),
        "tables": self.extract_tables(),
    }
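
A sketch showing the keys of the combined result:

from scrap_e.scrapers.web.parser import HtmlParser

parser = HtmlParser("<html><body><a href='/a'>A</a></body></html>")
data = parser.extract_structured_data()

print(sorted(data))             # ['images', 'links', 'metadata', 'tables']
print(data["links"][0]["url"])  # /a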

extract_table(selector)

Extract table data from HTML.

Source code in src/scrap_e/scrapers/web/parser.py
def extract_table(self, selector: str) -> dict[str, Any] | None:
    """Extract table data from HTML."""
    table = self.soup.select_one(selector)
    if not table:
        return None

    return self._parse_table(table)
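
A sketch of selecting a single table by CSS selector. The returned dict is built by the internal _parse_table helper, so its exact keys depend on that implementation:

from scrap_e.scrapers.web.parser import HtmlParser

html = """
<table id="prices">
    <tr><th>Item</th><th>Price</th></tr>
    <tr><td>Apple</td><td>1.20</td></tr>
</table>
"""

parser = HtmlParser(html)

table = parser.extract_table("#prices")    # dict for the first element matching the selector
missing = parser.extract_table("#absent")  # None when the selector matches nothing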

extract_tables()

Extract all tables as structured data.

Source code in src/scrap_e/scrapers/web/parser.py
def extract_tables(self) -> list[list[dict[str, Any] | list[Any]]]:
    """Extract all tables as structured data."""
    tables = []

    for table in self.soup.find_all("table"):
        if not isinstance(table, Tag):
            continue
        headers = []
        rows: list[dict[str, Any] | list[Any]] = []

        # Extract headers
        thead = table.find("thead")
        if thead and isinstance(thead, Tag):
            header_row = thead.find("tr")
            if header_row and isinstance(header_row, Tag):
                headers = [th.get_text(strip=True) for th in header_row.find_all(["th", "td"])]

        # If no thead, try first row
        if not headers:
            first_row = table.find("tr")
            if first_row and isinstance(first_row, Tag):
                potential_headers = first_row.find_all("th")
                if potential_headers:
                    headers = [th.get_text(strip=True) for th in potential_headers]

        # Extract rows
        tbody = table.find("tbody") or table
        if isinstance(tbody, Tag):
            for tr in tbody.find_all("tr"):
                if not isinstance(tr, Tag):
                    continue
                cells = tr.find_all(["td", "th"])
                if headers and len(cells) == len(headers):
                    row: dict[str, Any] | list[Any] = {
                        headers[i]: cell.get_text(strip=True) for i, cell in enumerate(cells)
                    }
                else:
                    row = [cell.get_text(strip=True) for cell in cells]

                if row:
                    rows.append(row)

        if rows:
            tables.append(rows)

    return tables
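
A sketch with an illustrative table: when a header row matches the body rows, each row becomes a dict keyed by the headers; otherwise rows are returned as plain lists of cell values:

from scrap_e.scrapers.web.parser import HtmlParser

html = """
<table>
    <thead><tr><th>Name</th><th>Role</th></tr></thead>
    <tbody>
        <tr><td>Ada</td><td>Engineer</td></tr>
        <tr><td>Grace</td><td>Admiral</td></tr>
    </tbody>
</table>
"""

parser = HtmlParser(html)
print(parser.extract_tables()[0])
# [{'Name': 'Ada', 'Role': 'Engineer'}, {'Name': 'Grace', 'Role': 'Admiral'}]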

extract_with_rule(rule)

Extract data using an extraction rule.

Parameters:

  • rule (ExtractionRule, required): Extraction rule to apply

Returns:

  • Any: Extracted data based on the rule

Source code in src/scrap_e/scrapers/web/parser.py
def extract_with_rule(self, rule: ExtractionRule) -> Any:
    """
    Extract data using an extraction rule.

    Args:
        rule: Extraction rule to apply

    Returns:
        Extracted data based on the rule
    """
    try:
        if rule.selector:
            return self._extract_css(rule)
        if rule.xpath:
            return self._extract_xpath(rule)
        if rule.regex:
            return self._extract_regex(rule)
        if rule.json_path:
            return self._extract_json(rule)
        raise ParsingError(f"No extraction method specified in rule: {rule.name}")
    except Exception as e:
        if rule.required:
            raise ParsingError(f"Failed to extract required field '{rule.name}': {e!s}") from e
        return rule.default

Usage Examples

Basic Parsing

from scrap_e.scrapers.web.parser import HtmlParser

html_content = """
<html>
<head><title>Example Page</title></head>
<body>
    <h1 class="main-title">Welcome</h1>
    <p class="content">This is a paragraph.</p>
    <div class="articles">
        <article id="1">First Article</article>
        <article id="2">Second Article</article>
    </div>
</body>
</html>
"""

# Create parser instance
parser = HtmlParser(html_content, parser_type="lxml")

# Access different parser backends
soup = parser.soup  # BeautifulSoup instance
lxml_tree = parser.lxml_tree  # lxml instance
selectolax_tree = parser.selectolax_tree  # selectolax instance (if available)

Extraction Rules

The parser works with ExtractionRule objects to define how data should be extracted:

from scrap_e.core.models import ExtractionRule

# CSS Selector extraction
title_rule = ExtractionRule(
    name="title",
    selector="h1.main-title",
    attribute="text"  # Extract text content
)

# Extract multiple elements
articles_rule = ExtractionRule(
    name="articles",
    selector="article",
    multiple=True,
    attribute="id"  # Extract id attribute
)

# XPath extraction
xpath_rule = ExtractionRule(
    name="content",
    xpath="//p[@class='content']/text()"
)

# Regex extraction
regex_rule = ExtractionRule(
    name="numbers",
    regex=r'\d+',
    multiple=True
)

# Extract data using rules
title = parser.extract_with_rule(title_rule)
articles = parser.extract_with_rule(articles_rule)
content = parser.extract_with_rule(xpath_rule)
numbers = parser.extract_with_rule(regex_rule)

print(f"Title: {title}")          # Title: Welcome
print(f"Articles: {articles}")    # Articles: ['1', '2']
print(f"Content: {content}")      # Content: This is a paragraph.

Advanced Extraction

Required Fields and Defaults

from scrap_e.core.exceptions import ParsingError

# Required field - raises ParsingError if not found
required_rule = ExtractionRule(
    name="required_title",
    selector="h1.missing-class",
    required=True
)

# With default value
optional_rule = ExtractionRule(
    name="optional_subtitle",
    selector="h2.subtitle",
    default="No subtitle"
)

try:
    title = parser.extract_with_rule(required_rule)
except ParsingError as e:
    print(f"Required field missing: {e}")

subtitle = parser.extract_with_rule(optional_rule)  # Returns "No subtitle"

Attribute Extraction

# Extract different attributes
link_rule = ExtractionRule(
    name="links",
    selector="a",
    attribute="href",
    multiple=True
)

image_rule = ExtractionRule(
    name="image_src",
    selector="img.hero",
    attribute="src"
)

data_rule = ExtractionRule(
    name="data_id",
    selector="div.widget",
    attribute="data-id"
)

# Extract custom attributes
links = parser.extract_with_rule(link_rule)
image_src = parser.extract_with_rule(image_rule)
data_id = parser.extract_with_rule(data_rule)

JSON-LD Extraction

html_with_json = """
<html>
<head>
    <script type="application/ld+json">
    {
        "@context": "https://schema.org",
        "@type": "Article",
        "headline": "How to Parse HTML",
        "author": {"name": "John Doe"},
        "datePublished": "2024-01-15"
    }
    </script>
</head>
<body>...</body>
</html>
"""

parser = HtmlParser(html_with_json)

# Extract JSON-LD data
json_rule = ExtractionRule(
    name="article_data",
    json_path="$.headline"  # JSONPath expression
)

headline = parser.extract_with_rule(json_rule)
print(f"Headline: {headline}")  # Headline: How to Parse HTML

Performance Considerations

Different backends have different performance characteristics:

import time
from scrap_e.scrapers.web.parser import HtmlParser

large_html = "<html>" + "<p>content</p>" * 10000 + "</html>"

# BeautifulSoup - Most features, slower
start = time.time()
parser_bs = HtmlParser(large_html, parser_type="html.parser")
_ = parser_bs.soup  # parsing happens lazily on first property access
soup_time = time.time() - start

# lxml - Fast and feature-rich
start = time.time()
parser_lxml = HtmlParser(large_html, parser_type="lxml")
_ = parser_lxml.lxml_tree
lxml_time = time.time() - start

print(f"BeautifulSoup: {soup_time:.3f}s")
print(f"lxml: {lxml_time:.3f}s")

# selectolax - Fastest for CSS selectors
if parser_lxml.selectolax_tree:
    start = time.time()
    selectolax_results = parser_lxml.selectolax_tree.css("p")
    selectolax_time = time.time() - start
    print(f"selectolax: {selectolax_time:.3f}s")

Custom Transform Functions

Apply custom transformations to extracted data:

import re

from scrap_e.core.models import ExtractionRule

def clean_price(value: str) -> float:
    """Convert price string to float."""
    if not value:
        return 0.0
    # Remove currency symbols and convert
    price_str = re.sub(r'[^\d.]', '', value)
    return float(price_str) if price_str else 0.0

def normalize_url(value: str, base_url: str) -> str:
    """Convert relative URL to absolute."""
    from urllib.parse import urljoin
    return urljoin(base_url, value)

# Example HTML with prices
html_content = """
<div class="products">
    <div class="product">
        <span class="price">$19.99</span>
        <a href="/product/1">Product 1</a>
    </div>
    <div class="product">
        <span class="price">€25.00</span>
        <a href="/product/2">Product 2</a>
    </div>
</div>
"""

parser = HtmlParser(html_content)

# Extract and transform prices
price_rule = ExtractionRule(
    name="prices",
    selector=".price",
    multiple=True,
    transform="custom"  # Will be processed by scraper
)

# Extract and transform URLs
url_rule = ExtractionRule(
    name="product_urls",
    selector=".product a",
    attribute="href",
    multiple=True,
    transform="to_absolute_url"  # Built-in transform
)

prices = parser.extract_with_rule(price_rule)
urls = parser.extract_with_rule(url_rule)

Backend Comparison

BeautifulSoup

Advantages:

  • Pure Python, always available
  • Excellent error handling
  • Rich API for tree manipulation
  • Great for complex parsing tasks

Disadvantages:

  • Slower than C-based parsers
  • Memory overhead for large documents

Best for: Complex parsing, data cleaning, and malformed HTML

lxml

Advantages:

  • Fast C-based implementation
  • Full XPath support
  • Good balance of speed and features
  • Memory efficient

Disadvantages:

  • External C dependency
  • Stricter HTML parsing

Best for: High-performance parsing, XPath queries, large documents

selectolax

Advantages:

  • Fastest CSS selector performance
  • Very low memory footprint
  • Simple, focused API

Disadvantages:

  • Limited feature set
  • CSS selectors only (no XPath)
  • Optional dependency

Best for: High-volume CSS selector extraction, performance-critical applications

Error Handling

The parser provides detailed error information:

from scrap_e.core.exceptions import ParsingError
from scrap_e.core.models import ExtractionRule

html_content = "<html><body><p>Test</p></body></html>"
parser = HtmlParser(html_content)

try:
    # This will fail because the element doesn't exist
    rule = ExtractionRule(
        name="missing_element",
        selector=".nonexistent",
        required=True
    )

    result = parser.extract_with_rule(rule)

except ParsingError as e:
    print(f"Parsing failed: {e}")
    print(f"Rule name: {e.details.get('rule_name')}")
    print(f"Selector: {e.details.get('selector')}")

Best Practices

1. Choose the Right Backend

# For maximum compatibility
parser = HtmlParser(html, parser_type="html.parser")

# For performance with well-formed HTML
parser = HtmlParser(html, parser_type="lxml")

# For malformed HTML that needs fixing
parser = HtmlParser(html, parser_type="html5lib")

2. Use Specific Selectors

# Good - specific selector
good_rule = ExtractionRule(
    name="product_title",
    selector="h1.product-title"
)

# Less efficient - overly broad selector
broad_rule = ExtractionRule(
    name="title",
    selector="h1"  # Might match multiple elements
)

3. Handle Missing Data Gracefully

# Always provide defaults for optional fields
rule = ExtractionRule(
    name="optional_field",
    selector=".may-not-exist",
    default="N/A",
    required=False
)

4. Optimize for Performance

# Cache parser instances for repeated use
class CachedParser:
    def __init__(self):
        self._parsers = {}

    def get_parser(self, html_content: str) -> HtmlParser:
        content_hash = hash(html_content)
        if content_hash not in self._parsers:
            self._parsers[content_hash] = HtmlParser(html_content)
        return self._parsers[content_hash]

# Use selectolax for CSS-only extraction at scale
html_content = "<div><p>content</p></div>"
parser = HtmlParser(html_content)
if parser.selectolax_tree:
    # selectolax is installed - use the fast backend for CSS selectors
    results = parser.selectolax_tree.css("p")

5. Test with Real-World HTML

from scrap_e.core.models import ExtractionRule
from scrap_e.scrapers.web.parser import HtmlParser

def test_parser_with_malformed_html():
    """Test parser handles real-world malformed HTML."""
    malformed_html = "<html><body><p>Unclosed paragraph<div>Nested incorrectly</body></html>"

    parser = HtmlParser(malformed_html, parser_type="html5lib")  # More forgiving

    rule = ExtractionRule(name="text", selector="p")
    result = parser.extract_with_rule(rule)

    assert result == "Unclosed paragraph"

The HTML Parser is a core component that powers all HTML-based data extraction in Scrap-E, providing flexibility and performance for various scraping scenarios.