# Sample Configuration for Market Trends Scraper
# Copy this file to config.yaml and customize it for your needs.

# Scraper settings
scraper:
  # Delay between requests in seconds (helps avoid being blocked)
  delay_between_requests: 1.5

  # Request timeout in seconds
  timeout: 30

  # Maximum number of retry attempts for failed requests
  max_retries: 3

  # User agent string for HTTP requests
  # NOTE(review): this string pins Chrome 91; consider refreshing it.
  user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"

  # Run browser in headless mode (no visible UI)
  headless: true

  # Browser window size [width, height]
  window_size: [1920, 1080]

# List of data sources to scrape
# Each entry describes one site: its start URL, whether it is enabled,
# the CSS selectors used to extract product fields, and pagination limits.
sources:
  # Example: Amazon (fictional selectors for demonstration)
  - name: "amazon"
    url: "https://www.amazon.com/s?k=laptop"
    type: "ecommerce"
    enabled: false  # Set to true to enable this source
    use_selenium: true  # Amazon often requires JavaScript rendering
    selectors:
      product: "div[data-component-type='s-search-result']"
      name: "h2 span.a-text-normal"
      price: "span.a-price-whole"
      rating: "span.a-icon-alt"
      availability: "span.a-color-success"
    pagination:
      next_page: "a.s-pagination-next"
      max_pages: 5

  # Example: Generic e-commerce site
  - name: "example_ecommerce"
    url: "https://example-ecommerce.com/search?q=phone"
    type: "ecommerce"
    enabled: true
    use_selenium: false
    selectors:
      product: "div.product-card"
      name: "h3.product-title"
      price: "span.price"
      rating: "div.rating-stars"
      availability: "div.stock-status"
    pagination:
      next_page: "a.pagination-next"
      max_pages: 10

  # Example: Electronics retailer
  - name: "electronics_store"
    url: "https://example-electronics.com/category/smartphones"
    type: "ecommerce"
    enabled: true
    use_selenium: false
    selectors:
      product: "article.product-item"
      name: "h1.product-name"
      price: "div.current-price"
      rating: "div.product-rating"
      availability: "span.availability-label"
    pagination:
      next_page: "li.page-item.next a"
      max_pages: 3

# Output settings
output:
  # Output format: csv, json, or excel
  format: "csv"

  # Include timestamp in output filename
  include_timestamp: true

  # Base filename for output files
  filename: "market_trends_data"

# Database settings (for future enhancements)
database:
  # Database connection URL
  url: "sqlite:///data/market_trends.db"

  # Enable SQL query logging
  echo: false

# Analysis settings
analysis:
  # Number of days to consider for price history analysis
  price_history_days: 30

  # Minimum price change percentage to consider as a trend (0.05 = 5%)
  trend_threshold: 0.05

  # Generate trend charts (requires matplotlib and seaborn)
  generate_charts: true