Initial commit: Market Trends Scraper

This commit is contained in:
Dev
2025-09-11 17:46:14 +03:00
commit 4ddcde68d4
17 changed files with 3049 additions and 0 deletions

101
config/sample_config.yaml Normal file
View File

@@ -0,0 +1,101 @@
# Sample Configuration for Market Trends Scraper
# Copy this file to config.yaml and customize for your needs
# Scraper settings: request pacing, retries, and browser options
scraper:
  # Delay between requests in seconds (helps avoid being blocked)
  delay_between_requests: 1.5

  # Request timeout in seconds
  timeout: 30

  # Maximum number of retry attempts for failed requests
  max_retries: 3

  # User agent string for HTTP requests
  user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"

  # Run browser in headless mode (no visible UI)
  headless: true

  # Browser window size [width, height]
  window_size: [1920, 1080]
# List of data sources to scrape.
# Each entry is one mapping with its own URL, selector strings, and
# pagination settings; entries with `enabled: false` are kept for reference.
sources:
  # Example: Amazon (fictional selectors for demonstration)
  - name: "amazon"
    url: "https://www.amazon.com/s?k=laptop"
    type: "ecommerce"
    enabled: false  # Set to true to enable this source
    use_selenium: true  # Amazon often requires JavaScript rendering
    # Selector strings, one per product field
    selectors:
      product: "div[data-component-type='s-search-result']"
      name: "h2 span.a-text-normal"
      price: "span.a-price-whole"
      rating: "span.a-icon-alt"
      availability: "span.a-color-success"
    # Selector for the next-page link and a page-count cap
    # NOTE(review): presumably max_pages bounds pages fetched per run; confirm against the scraper code
    pagination:
      next_page: "a.s-pagination-next"
      max_pages: 5

  # Example: Generic e-commerce site
  - name: "example_ecommerce"
    url: "https://example-ecommerce.com/search?q=phone"
    type: "ecommerce"
    enabled: true
    use_selenium: false
    selectors:
      product: "div.product-card"
      name: "h3.product-title"
      price: "span.price"
      rating: "div.rating-stars"
      availability: "div.stock-status"
    pagination:
      next_page: "a.pagination-next"
      max_pages: 10

  # Example: Electronics retailer
  - name: "electronics_store"
    url: "https://example-electronics.com/category/smartphones"
    type: "ecommerce"
    enabled: true
    use_selenium: false
    selectors:
      product: "article.product-item"
      name: "h1.product-name"
      price: "div.current-price"
      rating: "div.product-rating"
      availability: "span.availability-label"
    pagination:
      next_page: "li.page-item.next a"
      max_pages: 3
# Output settings
output:
  # Output format: csv, json, or excel
  format: "csv"

  # Include timestamp in output filename
  include_timestamp: true

  # Base filename for output files
  filename: "market_trends_data"
# Database settings (for future enhancements)
database:
  # Database connection URL
  url: "sqlite:///data/market_trends.db"

  # Enable SQL query logging
  echo: false
# Analysis settings
analysis:
  # Number of days to consider for price history analysis
  price_history_days: 30

  # Minimum price change percentage to consider as a trend (0.05 = 5%)
  trend_threshold: 0.05

  # Generate trend charts (requires matplotlib and seaborn)
  generate_charts: true