Commit 9b5ccac
feat(extraction): add RegexExtractionStrategy for pattern-based extraction

Add a new RegexExtractionStrategy for fast, zero-LLM extraction of common data types:

- Built-in patterns for emails, URLs, phones, dates, and more
- Support for custom regex patterns
- LLM-assisted pattern generation utility
- Optimized HTML preprocessing with fit_html field
- Enhanced network response body capture

Breaking changes: None
1 parent 94e9959 commit 9b5ccac

13 files changed: +984 −124 lines
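A minimal usage sketch of the new strategy follows. Only the `RegexExtractionStrategy` export is confirmed by this diff (see the `__init__.py` changes below); the built-in pattern flags and the `pattern=` keyword are assumptions drawn from the commit message.

```python
import asyncio
import json

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, RegexExtractionStrategy

async def main():
    # Assumed API: built-in patterns exposed as combinable flags on the class;
    # only the RegexExtractionStrategy export itself is confirmed by this diff.
    strategy = RegexExtractionStrategy(
        pattern=RegexExtractionStrategy.Email | RegexExtractionStrategy.Url
    )
    config = CrawlerRunConfig(extraction_strategy=strategy)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com", config=config)
        if result.success and result.extracted_content:
            # extracted_content is a JSON string, one record per match
            for match in json.loads(result.extracted_content):
                print(match)

asyncio.run(main())
```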

CHANGELOG.md

Lines changed: 15 additions & 0 deletions

```diff
@@ -5,6 +5,21 @@ All notable changes to Crawl4AI will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.6.2] - 2025-05-02
+
+### Added
+- New `RegexExtractionStrategy` for fast pattern-based extraction without requiring LLM
+- Built-in patterns for emails, URLs, phone numbers, dates, and more
+- Support for custom regex patterns
+- `generate_pattern` utility for LLM-assisted pattern creation (one-time use)
+- Added `fit_html` as a top-level field in `CrawlResult` for optimized HTML extraction
+- Added support for network response body capture in network request tracking
+
+### Changed
+- Updated documentation for no-LLM extraction strategies
+- Enhanced API reference to include RegexExtractionStrategy examples and usage
+- Improved HTML preprocessing with optimized performance for extraction strategies
+
 ## [0.6.1] - 2025-04-24
 
 ### Added
```
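The custom-pattern and `generate_pattern` entries above suggest usage along these lines; the `custom=` keyword is an assumption from the changelog wording, not something this diff shows.

```python
from crawl4ai import RegexExtractionStrategy

# Hypothetical custom pattern: a label -> regex mapping (keyword name assumed).
# The one-time generate_pattern utility would let an LLM draft such a regex
# once, after which every crawl runs with zero LLM calls.
price_strategy = RegexExtractionStrategy(
    custom={"price_usd": r"\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?"}
)
```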

crawl4ai/__init__.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -23,7 +23,8 @@
     CosineStrategy,
     JsonCssExtractionStrategy,
     JsonXPathExtractionStrategy,
-    JsonLxmlExtractionStrategy
+    JsonLxmlExtractionStrategy,
+    RegexExtractionStrategy
 )
 from .chunking_strategy import ChunkingStrategy, RegexChunking
 from .markdown_generation_strategy import DefaultMarkdownGenerator
@@ -105,6 +106,7 @@
     "JsonCssExtractionStrategy",
     "JsonXPathExtractionStrategy",
     "JsonLxmlExtractionStrategy",
+    "RegexExtractionStrategy",
     "ChunkingStrategy",
     "RegexChunking",
     "DefaultMarkdownGenerator",
```

crawl4ai/async_crawler_strategy.py

Lines changed: 14 additions & 1 deletion

Note: as committed, the inner `except` branch assigned `body = None`, leaving `text_body` unbound when `response.text()` fails; the hunk below assigns `text_body = None` instead so the capture dict never hits a NameError.

```diff
@@ -571,6 +571,14 @@ async def handle_request_capture(request):
 
         async def handle_response_capture(response):
             try:
+                try:
+                    # body = await response.body()
+                    # json_body = await response.json()
+                    text_body = await response.text()
+                except Exception:
+                    text_body = None
+                    # body = None
+                    # json_body = None
                 captured_requests.append({
                     "event_type": "response",
                     "url": response.url,
@@ -579,7 +587,12 @@ async def handle_response_capture(response):
                     "headers": dict(response.headers),  # Convert Header dict
                     "from_service_worker": response.from_service_worker,
                     "request_timing": response.request.timing,  # Detailed timing info
-                    "timestamp": time.time()
+                    "timestamp": time.time(),
+                    "body": {
+                        # "raw": body,
+                        # "json": json_body,
+                        "text": text_body
+                    }
                 })
             except Exception as e:
                 if self.logger:
```
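A sketch of consuming the newly captured response bodies, assuming `capture_network_requests=True` routes the dicts built above into `result.network_requests` (the plumbing between them sits outside this hunk):

```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def dump_response_bodies(url: str):
    config = CrawlerRunConfig(capture_network_requests=True)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url, config=config)
        for event in result.network_requests or []:
            # "event_type" and "body" follow the dict shape built in the diff above
            if event.get("event_type") == "response" and event.get("body"):
                text = event["body"].get("text") or ""
                print(event["url"], text[:80])
```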

crawl4ai/async_webcrawler.py

Lines changed: 6 additions & 2 deletions

```diff
@@ -503,6 +503,8 @@ async def aprocess_html(
         tables = media.pop("tables", [])
         links = result.links.model_dump()
         metadata = result.metadata
+
+        fit_html = preprocess_html_for_schema(html_content=html, text_threshold=500, max_size=300_000)
 
         ################################
         # Generate Markdown            #
@@ -519,7 +521,7 @@ async def aprocess_html(
         html_source_selector = {
             "raw_html": lambda: html,  # The original raw HTML
             "cleaned_html": lambda: cleaned_html,  # The HTML after scraping strategy
-            "fit_html": lambda: preprocess_html_for_schema(html_content=html),  # Preprocessed raw HTML
+            "fit_html": lambda: fit_html,  # The HTML after preprocessing for schema
         }
 
         markdown_input_html = cleaned_html  # Default to cleaned_html
@@ -593,14 +595,15 @@ async def aprocess_html(
         content = {
             "markdown": markdown_result.raw_markdown,
             "html": html,
+            "fit_html": fit_html,
             "cleaned_html": cleaned_html,
             "fit_markdown": markdown_result.fit_markdown,
         }.get(content_format, markdown_result.raw_markdown)
 
         # Use IdentityChunking for HTML input, otherwise use provided chunking strategy
         chunking = (
             IdentityChunking()
-            if content_format in ["html", "cleaned_html"]
+            if content_format in ["html", "cleaned_html", "fit_html"]
             else config.chunking_strategy
         )
         sections = chunking.chunk(content)
@@ -624,6 +627,7 @@ async def aprocess_html(
         return CrawlResult(
             url=url,
             html=html,
+            fit_html=fit_html,
             cleaned_html=cleaned_html,
             markdown=markdown_result,
             media=media,
```
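With `fit_html` now a top-level `CrawlResult` field, callers can reuse the schema-oriented preprocessed HTML without invoking `preprocess_html_for_schema` themselves; a brief sketch (the ~300 KB cap comes from the `max_size=300_000` call above):

```python
from crawl4ai import AsyncWebCrawler

async def compare_html_sizes(url: str):
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url)
        # fit_html is the pruned, schema-friendly HTML, capped near 300 KB
        print("raw:", len(result.html), "fit:", len(result.fit_html or ""))
```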

crawl4ai/browser_profiler.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -475,7 +475,7 @@ async def my_crawl_function(profile_path, url):
             self.logger.warning(" No profiles found. Create one first with option 1.", tag="PROFILES")
             continue
 
-        # Print profile information with colorama formatting
+        # Print profile information
         self.logger.info("\nAvailable profiles:", tag="PROFILES")
         for i, profile in enumerate(profiles):
             self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES")
```
