"""
Web Scraper Module

This module contains the core functionality for scraping e-commerce websites
to collect product and pricing data for market trend analysis.
"""

import time
import json
import random
from datetime import datetime
from typing import Dict, List, Any, Optional, Union
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException

from logger import LoggerMixin
from config_manager import ConfigManager

class MarketTrendsScraper(LoggerMixin):
    """
    Main scraper class for collecting market trends data from e-commerce websites.
    """

    def __init__(self, config: Dict[str, Any], headless: bool = True):
        """
        Initialize the scraper with configuration.

        Args:
            config: Configuration dictionary
            headless: Whether to run browser in headless mode
        """
        self.config = config
        self.driver = None
        self.session = requests.Session()
        self.data = []
        self._setup_browser(headless)
        self._setup_session()
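
    # Illustrative configuration shape, inferred from the keys this class reads
    # with dict.get(); the concrete structure produced by ConfigManager may
    # differ, and the selector values below are hypothetical examples only:
    #
    # {
    #     "scraper.user_agent": "Mozilla/5.0 (compatible; ExampleBot/1.0)",
    #     "scraper.window_size": [1920, 1080],
    #     "scraper.timeout": 30,
    #     "scraper.max_retries": 3,
    #     "scraper.delay_between_requests": 1.0,
    #     "sources": [
    #         {
    #             "name": "Example Store",
    #             "url": "https://example.com/products",
    #             "enabled": True,
    #             "use_selenium": False,
    #             "selectors": {
    #                 "product": ".product-card",
    #                 "name": ".product-title",
    #                 "price": ".price",
    #                 "rating": ".rating",
    #                 "availability": ".stock-status",
    #             },
    #             "pagination": {"max_pages": 5, "next_page": ".pagination .next"},
    #         }
    #     ],
    # }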
    def _setup_browser(self, headless: bool = True) -> None:
        """
        Set up the Selenium WebDriver with appropriate options.

        Args:
            headless: Whether to run browser in headless mode
        """
        try:
            chrome_options = Options()

            # Set headless mode
            if headless:
                chrome_options.add_argument("--headless")

            # Set window size
            window_size = self.config.get("scraper.window_size", [1920, 1080])
            chrome_options.add_argument(f"--window-size={window_size[0]},{window_size[1]}")

            # Add other options for stability
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--disable-web-security")
            chrome_options.add_argument("--disable-features=VizDisplayCompositor")

            # Set user agent
            user_agent = self.config.get("scraper.user_agent", "")
            if user_agent:
                chrome_options.add_argument(f"--user-agent={user_agent}")

            # Initialize driver
            self.driver = webdriver.Chrome(options=chrome_options)
            self.driver.set_page_load_timeout(self.config.get("scraper.timeout", 30))

            self.logger.info("Browser setup completed")

        except Exception as e:
            self.logger.error(f"Failed to setup browser: {str(e)}")
            raise
    def _setup_session(self) -> None:
        """
        Set up the requests session with appropriate headers.
        """
        user_agent = self.config.get("scraper.user_agent", "")
        if user_agent:
            self.session.headers.update({"User-Agent": user_agent})

        # Add other headers
        self.session.headers.update({
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1"
        })

        self.logger.info("Session setup completed")
    def scrape_market_trends(self) -> List[Dict[str, Any]]:
        """
        Scrape market trends data from all configured sources.

        Returns:
            List of dictionaries containing scraped data
        """
        all_data = []
        sources = self.config.get("sources", [])

        for source in sources:
            if not source.get("enabled", True):
                self.logger.info(f"Skipping disabled source: {source.get('name', 'Unknown')}")
                continue

            self.logger.info(f"Scraping source: {source.get('name', 'Unknown')}")

            try:
                source_data = self._scrape_source(source)
                all_data.extend(source_data)
                self.logger.info(f"Scraped {len(source_data)} records from {source.get('name', 'Unknown')}")
            except Exception as e:
                self.logger.error(f"Failed to scrape source {source.get('name', 'Unknown')}: {str(e)}")
                continue

        return all_data
    def _scrape_source(self, source: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Scrape data from a specific source.

        Args:
            source: Source configuration dictionary

        Returns:
            List of dictionaries containing scraped data
        """
        source_data = []
        url = source.get("url", "")
        selectors = source.get("selectors", {})
        pagination = source.get("pagination", {})

        if not url:
            self.logger.warning(f"No URL configured for source: {source.get('name', 'Unknown')}")
            return source_data

        # Determine scraping method
        use_selenium = source.get("use_selenium", False)

        if use_selenium:
            return self._scrape_with_selenium(source)
        else:
            return self._scrape_with_requests(source)
    def _scrape_with_requests(self, source: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Scrape data using requests and BeautifulSoup.

        Args:
            source: Source configuration dictionary

        Returns:
            List of dictionaries containing scraped data
        """
        source_data = []
        url = source.get("url", "")
        selectors = source.get("selectors", {})
        pagination = source.get("pagination", {})
        max_pages = pagination.get("max_pages", 1)

        for page in range(1, max_pages + 1):
            try:
                # Add page parameter if needed
                page_url = url
                if page > 1:
                    page_url = f"{url}?page={page}"

                self.logger.debug(f"Scraping page {page}: {page_url}")

                # Make request with retry logic
                response = self._make_request_with_retry(page_url)

                if not response:
                    continue

                # Parse HTML
                soup = BeautifulSoup(response.text, 'html.parser')

                # Extract product data
                products = soup.select(selectors.get("product", ""))

                for product in products:
                    product_data = self._extract_product_data(product, selectors)
                    if product_data:
                        product_data["source"] = source.get("name", "Unknown")
                        product_data["scraped_at"] = datetime.now().isoformat()
                        source_data.append(product_data)

                # Check if there's a next page
                if page < max_pages:
                    next_page = soup.select_one(pagination.get("next_page", ""))
                    if not next_page:
                        self.logger.debug(f"No more pages found after page {page}")
                        break

                # Delay between requests
                delay = self.config.get("scraper.delay_between_requests", 1.0)
                time.sleep(delay + random.uniform(0, 1))

            except Exception as e:
                self.logger.error(f"Error scraping page {page} from {source.get('name', 'Unknown')}: {str(e)}")
                continue

        return source_data
    def _scrape_with_selenium(self, source: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Scrape data using Selenium WebDriver.

        Args:
            source: Source configuration dictionary

        Returns:
            List of dictionaries containing scraped data
        """
        source_data = []
        url = source.get("url", "")
        selectors = source.get("selectors", {})
        pagination = source.get("pagination", {})
        max_pages = pagination.get("max_pages", 1)

        try:
            self.driver.get(url)

            for page in range(1, max_pages + 1):
                self.logger.debug(f"Scraping page {page} with Selenium")

                # Wait for products to load
                try:
                    WebDriverWait(self.driver, 10).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, selectors.get("product", "")))
                    )
                except TimeoutException:
                    self.logger.warning(f"Timeout waiting for products to load on page {page}")
                    continue

                # Extract product data
                products = self.driver.find_elements(By.CSS_SELECTOR, selectors.get("product", ""))

                for product in products:
                    product_data = self._extract_product_data_selenium(product, selectors)
                    if product_data:
                        product_data["source"] = source.get("name", "Unknown")
                        product_data["scraped_at"] = datetime.now().isoformat()
                        source_data.append(product_data)

                # Navigate to next page if available
                if page < max_pages:
                    try:
                        next_button = self.driver.find_element(By.CSS_SELECTOR, pagination.get("next_page", ""))
                        next_button.click()

                        # Wait for page to load
                        time.sleep(2)
                    except NoSuchElementException:
                        self.logger.debug(f"No next page button found after page {page}")
                        break

                # Delay between requests
                delay = self.config.get("scraper.delay_between_requests", 1.0)
                time.sleep(delay + random.uniform(0, 1))

        except Exception as e:
            self.logger.error(f"Error scraping with Selenium from {source.get('name', 'Unknown')}: {str(e)}")

        return source_data
    def _make_request_with_retry(self, url: str) -> Optional[requests.Response]:
        """
        Make HTTP request with retry logic.

        Args:
            url: URL to request

        Returns:
            Response object or None if failed
        """
        max_retries = self.config.get("scraper.max_retries", 3)
        timeout = self.config.get("scraper.timeout", 30)

        for attempt in range(max_retries):
            try:
                response = self.session.get(url, timeout=timeout)
                response.raise_for_status()
                return response
            except requests.RequestException as e:
                self.logger.warning(f"Request attempt {attempt + 1} failed for {url}: {str(e)}")
                if attempt < max_retries - 1:
                    # Exponential backoff
                    time.sleep((2 ** attempt) + random.uniform(0, 1))
                else:
                    self.logger.error(f"Max retries exceeded for {url}")
                    return None

        return None
    def _extract_product_data(self, product: BeautifulSoup, selectors: Dict[str, str]) -> Optional[Dict[str, Any]]:
        """
        Extract product data from HTML element using BeautifulSoup.

        Args:
            product: BeautifulSoup element containing product data
            selectors: Dictionary of CSS selectors

        Returns:
            Dictionary containing product data or None if extraction failed
        """
        try:
            data = {}

            # Extract name
            name_element = product.select_one(selectors.get("name", ""))
            data["name"] = name_element.get_text(strip=True) if name_element else None

            # Extract price
            price_element = product.select_one(selectors.get("price", ""))
            if price_element:
                price_text = price_element.get_text(strip=True)
                data["price"] = self._parse_price(price_text)
            else:
                data["price"] = None

            # Extract rating
            rating_element = product.select_one(selectors.get("rating", ""))
            if rating_element:
                rating_text = rating_element.get_text(strip=True)
                data["rating"] = self._parse_rating(rating_text)
            else:
                data["rating"] = None

            # Extract availability
            availability_element = product.select_one(selectors.get("availability", ""))
            data["availability"] = availability_element.get_text(strip=True) if availability_element else None

            # Extract URL if available
            link_element = product.select_one("a")
            if link_element and link_element.get("href"):
                data["url"] = link_element.get("href")

            return data if data.get("name") else None

        except Exception as e:
            self.logger.error(f"Error extracting product data: {str(e)}")
            return None
    def _extract_product_data_selenium(self, product, selectors: Dict[str, str]) -> Optional[Dict[str, Any]]:
        """
        Extract product data from Selenium WebElement.

        Args:
            product: Selenium WebElement containing product data
            selectors: Dictionary of CSS selectors

        Returns:
            Dictionary containing product data or None if extraction failed
        """
        try:
            data = {}

            # Extract name
            try:
                name_element = product.find_element(By.CSS_SELECTOR, selectors.get("name", ""))
                data["name"] = name_element.text.strip()
            except NoSuchElementException:
                data["name"] = None

            # Extract price
            try:
                price_element = product.find_element(By.CSS_SELECTOR, selectors.get("price", ""))
                price_text = price_element.text.strip()
                data["price"] = self._parse_price(price_text)
            except NoSuchElementException:
                data["price"] = None

            # Extract rating
            try:
                rating_element = product.find_element(By.CSS_SELECTOR, selectors.get("rating", ""))
                rating_text = rating_element.text.strip()
                data["rating"] = self._parse_rating(rating_text)
            except NoSuchElementException:
                data["rating"] = None

            # Extract availability
            try:
                availability_element = product.find_element(By.CSS_SELECTOR, selectors.get("availability", ""))
                data["availability"] = availability_element.text.strip()
            except NoSuchElementException:
                data["availability"] = None

            # Extract URL if available
            try:
                link_element = product.find_element(By.CSS_SELECTOR, "a")
                if link_element.get_attribute("href"):
                    data["url"] = link_element.get_attribute("href")
            except NoSuchElementException:
                pass

            return data if data.get("name") else None

        except Exception as e:
            self.logger.error(f"Error extracting product data with Selenium: {str(e)}")
            return None
    def _parse_price(self, price_text: str) -> Optional[float]:
        """
        Parse price text to extract numeric value.

        Args:
            price_text: Raw price text

        Returns:
            Parsed price as float or None if parsing failed
        """
        try:
            # Remove currency symbols and whitespace
            price_clean = price_text.replace('$', '').replace('€', '').replace('£', '').strip()

            # Treat a comma as a decimal separator (e.g. "19,99" -> 19.99).
            # Prices that use a comma as a thousands separator (e.g. "1,299.00")
            # will not convert and this method returns None for them.
            price_clean = price_clean.replace(',', '.')

            # Extract numeric part
            import re
            price_match = re.search(r'[\d.]+', price_clean)
            if price_match:
                return float(price_match.group())

            return None

        except Exception as e:
            self.logger.error(f"Error parsing price '{price_text}': {str(e)}")
            return None
    def _parse_rating(self, rating_text: str) -> Optional[float]:
        """
        Parse rating text to extract numeric value.

        Args:
            rating_text: Raw rating text

        Returns:
            Parsed rating as float or None if parsing failed
        """
        try:
            # Extract numeric part
            import re
            rating_match = re.search(r'[\d.]+', rating_text)
            if rating_match:
                return float(rating_match.group())

            return None

        except Exception as e:
            self.logger.error(f"Error parsing rating '{rating_text}': {str(e)}")
            return None
    def save_data(self, data: List[Dict[str, Any]], output_path: str) -> None:
        """
        Save scraped data to file.

        Args:
            data: List of dictionaries containing scraped data
            output_path: Path to output file
        """
        try:
            output_file = Path(output_path)
            output_file.parent.mkdir(parents=True, exist_ok=True)

            # Convert to DataFrame
            df = pd.DataFrame(data)

            # Save based on file extension
            if output_path.endswith('.csv'):
                df.to_csv(output_path, index=False)
            elif output_path.endswith('.json'):
                df.to_json(output_path, orient='records', indent=2)
            elif output_path.endswith('.xlsx'):
                df.to_excel(output_path, index=False)
            else:
                # Default to CSV
                output_path = output_path + '.csv'
                df.to_csv(output_path, index=False)

            self.logger.info(f"Data saved to {output_path}")

        except Exception as e:
            self.logger.error(f"Error saving data to {output_path}: {str(e)}")
            raise
    def analyze_trends(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Analyze market trends from scraped data.

        Args:
            data: List of dictionaries containing scraped data

        Returns:
            Dictionary containing trend analysis results
        """
        try:
            if not data:
                return {"error": "No data available for analysis"}

            # Convert to DataFrame
            df = pd.DataFrame(data)

            # Basic statistics
            analysis = {
                "total_products": len(df),
                "sources": df["source"].value_counts().to_dict(),
                "price_analysis": {},
                "rating_analysis": {},
                "availability_analysis": {}
            }

            # Price analysis
            if "price" in df.columns:
                price_data = df["price"].dropna()
                if not price_data.empty:
                    analysis["price_analysis"] = {
                        "average_price": float(price_data.mean()),
                        "min_price": float(price_data.min()),
                        "max_price": float(price_data.max()),
                        "median_price": float(price_data.median()),
                        "price_distribution": price_data.describe().to_dict()
                    }

            # Rating analysis
            if "rating" in df.columns:
                rating_data = df["rating"].dropna()
                if not rating_data.empty:
                    analysis["rating_analysis"] = {
                        "average_rating": float(rating_data.mean()),
                        "min_rating": float(rating_data.min()),
                        "max_rating": float(rating_data.max()),
                        "rating_distribution": rating_data.value_counts().to_dict()
                    }

            # Availability analysis
            if "availability" in df.columns:
                availability_data = df["availability"].dropna()
                if not availability_data.empty:
                    analysis["availability_analysis"] = availability_data.value_counts().to_dict()

            # Price trends by source
            if "price" in df.columns and "source" in df.columns:
                price_by_source = df.groupby("source")["price"].agg(["mean", "min", "max"]).to_dict()
                analysis["price_by_source"] = price_by_source

            self.logger.info("Trend analysis completed")
            return analysis

        except Exception as e:
            self.logger.error(f"Error analyzing trends: {str(e)}")
            return {"error": str(e)}
    def save_analysis(self, analysis: Dict[str, Any], output_path: str) -> None:
        """
        Save trend analysis results to file.

        Args:
            analysis: Dictionary containing analysis results
            output_path: Path to output file
        """
        try:
            output_file = Path(output_path)
            output_file.parent.mkdir(parents=True, exist_ok=True)

            with open(output_file, 'w', encoding='utf-8') as f:
                # default=str keeps json.dump from failing on values that are not
                # JSON-native, e.g. numpy integers produced by pandas value_counts()
                json.dump(analysis, f, indent=2, ensure_ascii=False, default=str)

            self.logger.info(f"Analysis saved to {output_path}")

        except Exception as e:
            self.logger.error(f"Error saving analysis to {output_path}: {str(e)}")
            raise
    def close(self) -> None:
        """
        Close browser and session.
        """
        try:
            if self.driver:
                self.driver.quit()
                self.driver = None

            if self.session:
                self.session.close()

            self.logger.info("Browser and session closed")

        except Exception as e:
            self.logger.error(f"Error closing browser/session: {str(e)}")
    def __enter__(self):
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit."""
        self.close()
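

if __name__ == "__main__":
    # Minimal usage sketch. The config dict below is a hand-written placeholder
    # (in practice it would come from ConfigManager) and the output paths are
    # hypothetical; fill in real sources and selectors before running.
    example_config = {
        "scraper.timeout": 30,
        "scraper.max_retries": 3,
        "scraper.delay_between_requests": 1.0,
        "sources": [],  # add source dicts shaped like the class-level example
    }

    with MarketTrendsScraper(example_config, headless=True) as scraper:
        records = scraper.scrape_market_trends()
        scraper.save_data(records, "data/raw/market_trends.csv")
        report = scraper.analyze_trends(records)
        scraper.save_analysis(report, "data/processed/trend_analysis.json")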