# trends-scraper/src/scraper.py
"""
Web Scraper Module
This module contains the core functionality for scraping e-commerce websites
to collect product and pricing data for market trend analysis.
"""
import time
import json
import random
import re
from datetime import datetime
from typing import Dict, List, Any, Optional, Union
from pathlib import Path
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from logger import LoggerMixin
from config_manager import ConfigManager
class MarketTrendsScraper(LoggerMixin):
"""
Main scraper class for collecting market trends data from e-commerce websites.
"""
def __init__(self, config: Dict[str, Any], headless: bool = True):
"""
Initialize the scraper with configuration.
Args:
config: Configuration dictionary
headless: Whether to run browser in headless mode
"""
self.config = config
self.driver = None
self.session = requests.Session()
self.data = []
self._setup_browser(headless)
self._setup_session()
def _setup_browser(self, headless: bool = True) -> None:
"""
Set up the Selenium WebDriver with appropriate options.
Args:
headless: Whether to run browser in headless mode
"""
try:
chrome_options = Options()
# Set headless mode
if headless:
chrome_options.add_argument("--headless")
# Set window size
window_size = self.config.get("scraper.window_size", [1920, 1080])
chrome_options.add_argument(f"--window-size={window_size[0]},{window_size[1]}")
# Add other options for stability
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--disable-web-security")
chrome_options.add_argument("--disable-features=VizDisplayCompositor")
# Set user agent
user_agent = self.config.get("scraper.user_agent", "")
if user_agent:
chrome_options.add_argument(f"--user-agent={user_agent}")
# Initialize driver
self.driver = webdriver.Chrome(options=chrome_options)
self.driver.set_page_load_timeout(self.config.get("scraper.timeout", 30))
self.logger.info("Browser setup completed")
except Exception as e:
self.logger.error(f"Failed to setup browser: {str(e)}")
raise
def _setup_session(self) -> None:
"""
Set up the requests session with appropriate headers.
"""
user_agent = self.config.get("scraper.user_agent", "")
if user_agent:
self.session.headers.update({"User-Agent": user_agent})
# Add other headers
self.session.headers.update({
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1"
})
self.logger.info("Session setup completed")
def scrape_market_trends(self) -> List[Dict[str, Any]]:
"""
Scrape market trends data from all configured sources.
Returns:
List of dictionaries containing scraped data
"""
all_data = []
sources = self.config.get("sources", [])
for source in sources:
if not source.get("enabled", True):
self.logger.info(f"Skipping disabled source: {source.get('name', 'Unknown')}")
continue
self.logger.info(f"Scraping source: {source.get('name', 'Unknown')}")
try:
source_data = self._scrape_source(source)
all_data.extend(source_data)
self.logger.info(f"Scraped {len(source_data)} records from {source.get('name', 'Unknown')}")
except Exception as e:
self.logger.error(f"Failed to scrape source {source.get('name', 'Unknown')}: {str(e)}")
continue
return all_data
def _scrape_source(self, source: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
Scrape data from a specific source.
Args:
source: Source configuration dictionary
Returns:
List of dictionaries containing scraped data
"""
        url = source.get("url", "")
        if not url:
            self.logger.warning(f"No URL configured for source: {source.get('name', 'Unknown')}")
            return []
# Determine scraping method
use_selenium = source.get("use_selenium", False)
if use_selenium:
return self._scrape_with_selenium(source)
else:
return self._scrape_with_requests(source)
def _scrape_with_requests(self, source: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
Scrape data using requests and BeautifulSoup.
Args:
source: Source configuration dictionary
Returns:
List of dictionaries containing scraped data
"""
source_data = []
url = source.get("url", "")
selectors = source.get("selectors", {})
pagination = source.get("pagination", {})
max_pages = pagination.get("max_pages", 1)
for page in range(1, max_pages + 1):
try:
                # Add page parameter if needed (preserve any existing query string)
                page_url = url
                if page > 1:
                    separator = '&' if '?' in url else '?'
                    page_url = f"{url}{separator}page={page}"
self.logger.debug(f"Scraping page {page}: {page_url}")
# Make request with retry logic
response = self._make_request_with_retry(page_url)
if not response:
continue
# Parse HTML
soup = BeautifulSoup(response.text, 'html.parser')
# Extract product data
products = soup.select(selectors.get("product", ""))
for product in products:
                    product_data = self._extract_product_data(product, selectors, base_url=page_url)
if product_data:
product_data["source"] = source.get("name", "Unknown")
product_data["scraped_at"] = datetime.now().isoformat()
source_data.append(product_data)
# Check if there's a next page
if page < max_pages:
next_page = soup.select_one(pagination.get("next_page", ""))
if not next_page:
self.logger.debug(f"No more pages found after page {page}")
break
# Delay between requests
delay = self.config.get("scraper.delay_between_requests", 1.0)
time.sleep(delay + random.uniform(0, 1))
except Exception as e:
self.logger.error(f"Error scraping page {page} from {source.get('name', 'Unknown')}: {str(e)}")
continue
return source_data
def _scrape_with_selenium(self, source: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
Scrape data using Selenium WebDriver.
Args:
source: Source configuration dictionary
Returns:
List of dictionaries containing scraped data
"""
source_data = []
url = source.get("url", "")
selectors = source.get("selectors", {})
pagination = source.get("pagination", {})
max_pages = pagination.get("max_pages", 1)
try:
self.driver.get(url)
for page in range(1, max_pages + 1):
self.logger.debug(f"Scraping page {page} with Selenium")
# Wait for products to load
try:
WebDriverWait(self.driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, selectors.get("product", "")))
)
                except TimeoutException:
                    self.logger.warning(f"Timeout waiting for products to load on page {page}")
                    # No navigation has happened yet, so retrying the same page only repeats the wait
                    break
# Extract product data
products = self.driver.find_elements(By.CSS_SELECTOR, selectors.get("product", ""))
for product in products:
product_data = self._extract_product_data_selenium(product, selectors)
if product_data:
product_data["source"] = source.get("name", "Unknown")
product_data["scraped_at"] = datetime.now().isoformat()
source_data.append(product_data)
# Navigate to next page if available
if page < max_pages:
try:
next_button = self.driver.find_element(By.CSS_SELECTOR, pagination.get("next_page", ""))
next_button.click()
# Wait for page to load
time.sleep(2)
except NoSuchElementException:
self.logger.debug(f"No next page button found after page {page}")
break
# Delay between requests
delay = self.config.get("scraper.delay_between_requests", 1.0)
time.sleep(delay + random.uniform(0, 1))
except Exception as e:
self.logger.error(f"Error scraping with Selenium from {source.get('name', 'Unknown')}: {str(e)}")
return source_data
def _make_request_with_retry(self, url: str) -> Optional[requests.Response]:
"""
Make HTTP request with retry logic.
Args:
url: URL to request
Returns:
Response object or None if failed
"""
max_retries = self.config.get("scraper.max_retries", 3)
timeout = self.config.get("scraper.timeout", 30)
for attempt in range(max_retries):
try:
response = self.session.get(url, timeout=timeout)
response.raise_for_status()
return response
except requests.RequestException as e:
self.logger.warning(f"Request attempt {attempt + 1} failed for {url}: {str(e)}")
if attempt < max_retries - 1:
# Exponential backoff
time.sleep((2 ** attempt) + random.uniform(0, 1))
else:
self.logger.error(f"Max retries exceeded for {url}")
return None
return None
    def _extract_product_data(self, product: BeautifulSoup, selectors: Dict[str, str], base_url: str = "") -> Optional[Dict[str, Any]]:
        """
        Extract product data from an HTML element using BeautifulSoup.
        Args:
            product: BeautifulSoup element containing product data
            selectors: Dictionary of CSS selectors
            base_url: Page URL used to resolve relative product links
        Returns:
            Dictionary containing product data or None if extraction failed
        """
try:
data = {}
# Extract name
name_element = product.select_one(selectors.get("name", ""))
data["name"] = name_element.get_text(strip=True) if name_element else None
# Extract price
price_element = product.select_one(selectors.get("price", ""))
if price_element:
price_text = price_element.get_text(strip=True)
data["price"] = self._parse_price(price_text)
else:
data["price"] = None
# Extract rating
rating_element = product.select_one(selectors.get("rating", ""))
if rating_element:
rating_text = rating_element.get_text(strip=True)
data["rating"] = self._parse_rating(rating_text)
else:
data["rating"] = None
# Extract availability
availability_element = product.select_one(selectors.get("availability", ""))
data["availability"] = availability_element.get_text(strip=True) if availability_element else None
            # Extract URL if available (resolve relative links against the page URL)
            link_element = product.select_one("a")
            if link_element and link_element.get("href"):
                data["url"] = urljoin(base_url, link_element.get("href"))
return data if data.get("name") else None
except Exception as e:
self.logger.error(f"Error extracting product data: {str(e)}")
return None
def _extract_product_data_selenium(self, product, selectors: Dict[str, str]) -> Optional[Dict[str, Any]]:
"""
Extract product data from Selenium WebElement.
Args:
product: Selenium WebElement containing product data
selectors: Dictionary of CSS selectors
Returns:
Dictionary containing product data or None if extraction failed
"""
try:
data = {}
# Extract name
try:
name_element = product.find_element(By.CSS_SELECTOR, selectors.get("name", ""))
data["name"] = name_element.text.strip()
except NoSuchElementException:
data["name"] = None
# Extract price
try:
price_element = product.find_element(By.CSS_SELECTOR, selectors.get("price", ""))
price_text = price_element.text.strip()
data["price"] = self._parse_price(price_text)
except NoSuchElementException:
data["price"] = None
# Extract rating
try:
rating_element = product.find_element(By.CSS_SELECTOR, selectors.get("rating", ""))
rating_text = rating_element.text.strip()
data["rating"] = self._parse_rating(rating_text)
except NoSuchElementException:
data["rating"] = None
# Extract availability
try:
availability_element = product.find_element(By.CSS_SELECTOR, selectors.get("availability", ""))
data["availability"] = availability_element.text.strip()
except NoSuchElementException:
data["availability"] = None
# Extract URL if available
try:
link_element = product.find_element(By.CSS_SELECTOR, "a")
if link_element.get_attribute("href"):
data["url"] = link_element.get_attribute("href")
except NoSuchElementException:
pass
return data if data.get("name") else None
except Exception as e:
self.logger.error(f"Error extracting product data with Selenium: {str(e)}")
return None
def _parse_price(self, price_text: str) -> Optional[float]:
"""
Parse price text to extract numeric value.
Args:
price_text: Raw price text
Returns:
Parsed price as float or None if parsing failed
"""
        try:
            # Remove currency symbols and surrounding whitespace (e.g. "$1,299.99" -> "1,299.99")
            price_clean = price_text.replace('$', '').replace('€', '').replace('£', '').strip()
            # Normalise separators: drop thousands commas ("1,299.99" -> "1299.99"),
            # otherwise treat a lone comma as a decimal separator ("1299,99" -> "1299.99")
            if ',' in price_clean and '.' in price_clean:
                price_clean = price_clean.replace(',', '')
            else:
                price_clean = price_clean.replace(',', '.')
            # Extract the numeric part
            price_match = re.search(r'[\d.]+', price_clean)
            if price_match:
                return float(price_match.group())
            return None
except Exception as e:
self.logger.error(f"Error parsing price '{price_text}': {str(e)}")
return None
def _parse_rating(self, rating_text: str) -> Optional[float]:
"""
Parse rating text to extract numeric value.
Args:
rating_text: Raw rating text
Returns:
Parsed rating as float or None if parsing failed
"""
try:
            # Extract the numeric part (e.g. "4.5 out of 5 stars" -> 4.5)
rating_match = re.search(r'[\d.]+', rating_text)
if rating_match:
return float(rating_match.group())
return None
except Exception as e:
self.logger.error(f"Error parsing rating '{rating_text}': {str(e)}")
return None
def save_data(self, data: List[Dict[str, Any]], output_path: str) -> None:
"""
Save scraped data to file.
Args:
data: List of dictionaries containing scraped data
output_path: Path to output file
"""
try:
output_file = Path(output_path)
output_file.parent.mkdir(parents=True, exist_ok=True)
# Convert to DataFrame
df = pd.DataFrame(data)
# Save based on file extension
if output_path.endswith('.csv'):
df.to_csv(output_path, index=False)
elif output_path.endswith('.json'):
df.to_json(output_path, orient='records', indent=2)
elif output_path.endswith('.xlsx'):
df.to_excel(output_path, index=False)
else:
# Default to CSV
output_path = output_path + '.csv'
df.to_csv(output_path, index=False)
self.logger.info(f"Data saved to {output_path}")
except Exception as e:
self.logger.error(f"Error saving data to {output_path}: {str(e)}")
raise
def analyze_trends(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
Analyze market trends from scraped data.
Args:
data: List of dictionaries containing scraped data
Returns:
Dictionary containing trend analysis results
"""
try:
if not data:
return {"error": "No data available for analysis"}
# Convert to DataFrame
df = pd.DataFrame(data)
# Basic statistics
analysis = {
"total_products": len(df),
"sources": df["source"].value_counts().to_dict(),
"price_analysis": {},
"rating_analysis": {},
"availability_analysis": {}
}
# Price analysis
if "price" in df.columns:
price_data = df["price"].dropna()
if not price_data.empty:
analysis["price_analysis"] = {
"average_price": float(price_data.mean()),
"min_price": float(price_data.min()),
"max_price": float(price_data.max()),
"median_price": float(price_data.median()),
"price_distribution": price_data.describe().to_dict()
}
# Rating analysis
if "rating" in df.columns:
rating_data = df["rating"].dropna()
if not rating_data.empty:
analysis["rating_analysis"] = {
"average_rating": float(rating_data.mean()),
"min_rating": float(rating_data.min()),
"max_rating": float(rating_data.max()),
"rating_distribution": rating_data.value_counts().to_dict()
}
# Availability analysis
if "availability" in df.columns:
availability_data = df["availability"].dropna()
if not availability_data.empty:
analysis["availability_analysis"] = availability_data.value_counts().to_dict()
# Price trends by source
if "price" in df.columns and "source" in df.columns:
price_by_source = df.groupby("source")["price"].agg(["mean", "min", "max"]).to_dict()
analysis["price_by_source"] = price_by_source
self.logger.info("Trend analysis completed")
return analysis
except Exception as e:
self.logger.error(f"Error analyzing trends: {str(e)}")
return {"error": str(e)}
def save_analysis(self, analysis: Dict[str, Any], output_path: str) -> None:
"""
Save trend analysis results to file.
Args:
analysis: Dictionary containing analysis results
output_path: Path to output file
"""
try:
output_file = Path(output_path)
output_file.parent.mkdir(parents=True, exist_ok=True)
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(analysis, f, indent=2, ensure_ascii=False)
self.logger.info(f"Analysis saved to {output_path}")
except Exception as e:
self.logger.error(f"Error saving analysis to {output_path}: {str(e)}")
raise
def close(self) -> None:
"""
Close browser and session.
"""
try:
if self.driver:
self.driver.quit()
self.driver = None
if self.session:
self.session.close()
self.logger.info("Browser and session closed")
except Exception as e:
self.logger.error(f"Error closing browser/session: {str(e)}")
def __enter__(self):
"""Context manager entry."""
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit."""
self.close()
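

if __name__ == "__main__":
    # Minimal usage sketch, assuming a flat dict with dotted keys and a single
    # requests-based source. Every URL, selector, and path below is a
    # placeholder for illustration, not part of any real configuration; a real
    # run would typically load the configuration through ConfigManager.
    example_config = {
        "scraper.timeout": 30,
        "scraper.max_retries": 3,
        "scraper.delay_between_requests": 1.0,
        "scraper.user_agent": "Mozilla/5.0 (compatible; MarketTrendsScraper/1.0)",
        "sources": [
            {
                "name": "Example Store",
                "url": "https://example.com/products",
                "enabled": True,
                "use_selenium": False,
                "selectors": {
                    "product": ".product-card",
                    "name": ".product-title",
                    "price": ".price",
                    "rating": ".rating",
                    "availability": ".stock-status",
                },
                "pagination": {"max_pages": 2, "next_page": "a.next"},
            }
        ],
    }
    with MarketTrendsScraper(example_config, headless=True) as scraper:
        records = scraper.scrape_market_trends()
        scraper.save_data(records, "output/market_trends.csv")
        scraper.save_analysis(scraper.analyze_trends(records), "output/analysis.json")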