""" Web Scraper Module This module contains the core functionality for scraping e-commerce websites to collect product and pricing data for market trend analysis. """ import time import json import random from datetime import datetime from typing import Dict, List, Any, Optional, Union from pathlib import Path from urllib.parse import urljoin, urlparse import requests from bs4 import BeautifulSoup import pandas as pd from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.chrome.options import Options from selenium.common.exceptions import TimeoutException, NoSuchElementException from logger import LoggerMixin from config_manager import ConfigManager class MarketTrendsScraper(LoggerMixin): """ Main scraper class for collecting market trends data from e-commerce websites. """ def __init__(self, config: Dict[str, Any], headless: bool = True): """ Initialize the scraper with configuration. Args: config: Configuration dictionary headless: Whether to run browser in headless mode """ self.config = config self.driver = None self.session = requests.Session() self.data = [] self._setup_browser(headless) self._setup_session() def _setup_browser(self, headless: bool = True) -> None: """ Set up the Selenium WebDriver with appropriate options. Args: headless: Whether to run browser in headless mode """ try: chrome_options = Options() # Set headless mode if headless: chrome_options.add_argument("--headless") # Set window size window_size = self.config.get("scraper.window_size", [1920, 1080]) chrome_options.add_argument(f"--window-size={window_size[0]},{window_size[1]}") # Add other options for stability chrome_options.add_argument("--no-sandbox") chrome_options.add_argument("--disable-dev-shm-usage") chrome_options.add_argument("--disable-gpu") chrome_options.add_argument("--disable-web-security") chrome_options.add_argument("--disable-features=VizDisplayCompositor") # Set user agent user_agent = self.config.get("scraper.user_agent", "") if user_agent: chrome_options.add_argument(f"--user-agent={user_agent}") # Initialize driver self.driver = webdriver.Chrome(options=chrome_options) self.driver.set_page_load_timeout(self.config.get("scraper.timeout", 30)) self.logger.info("Browser setup completed") except Exception as e: self.logger.error(f"Failed to setup browser: {str(e)}") raise def _setup_session(self) -> None: """ Set up the requests session with appropriate headers. """ user_agent = self.config.get("scraper.user_agent", "") if user_agent: self.session.headers.update({"User-Agent": user_agent}) # Add other headers self.session.headers.update({ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", "Accept-Language": "en-US,en;q=0.5", "Accept-Encoding": "gzip, deflate", "Connection": "keep-alive", "Upgrade-Insecure-Requests": "1" }) self.logger.info("Session setup completed") def scrape_market_trends(self) -> List[Dict[str, Any]]: """ Scrape market trends data from all configured sources. Returns: List of dictionaries containing scraped data """ all_data = [] sources = self.config.get("sources", []) for source in sources: if not source.get("enabled", True): self.logger.info(f"Skipping disabled source: {source.get('name', 'Unknown')}") continue self.logger.info(f"Scraping source: {source.get('name', 'Unknown')}") try: source_data = self._scrape_source(source) all_data.extend(source_data) self.logger.info(f"Scraped {len(source_data)} records from {source.get('name', 'Unknown')}") except Exception as e: self.logger.error(f"Failed to scrape source {source.get('name', 'Unknown')}: {str(e)}") continue return all_data def _scrape_source(self, source: Dict[str, Any]) -> List[Dict[str, Any]]: """ Scrape data from a specific source. Args: source: Source configuration dictionary Returns: List of dictionaries containing scraped data """ source_data = [] url = source.get("url", "") selectors = source.get("selectors", {}) pagination = source.get("pagination", {}) if not url: self.logger.warning(f"No URL configured for source: {source.get('name', 'Unknown')}") return source_data # Determine scraping method use_selenium = source.get("use_selenium", False) if use_selenium: return self._scrape_with_selenium(source) else: return self._scrape_with_requests(source) def _scrape_with_requests(self, source: Dict[str, Any]) -> List[Dict[str, Any]]: """ Scrape data using requests and BeautifulSoup. Args: source: Source configuration dictionary Returns: List of dictionaries containing scraped data """ source_data = [] url = source.get("url", "") selectors = source.get("selectors", {}) pagination = source.get("pagination", {}) max_pages = pagination.get("max_pages", 1) for page in range(1, max_pages + 1): try: # Add page parameter if needed page_url = url if page > 1: page_url = f"{url}?page={page}" self.logger.debug(f"Scraping page {page}: {page_url}") # Make request with retry logic response = self._make_request_with_retry(page_url) if not response: continue # Parse HTML soup = BeautifulSoup(response.text, 'html.parser') # Extract product data products = soup.select(selectors.get("product", "")) for product in products: product_data = self._extract_product_data(product, selectors) if product_data: product_data["source"] = source.get("name", "Unknown") product_data["scraped_at"] = datetime.now().isoformat() source_data.append(product_data) # Check if there's a next page if page < max_pages: next_page = soup.select_one(pagination.get("next_page", "")) if not next_page: self.logger.debug(f"No more pages found after page {page}") break # Delay between requests delay = self.config.get("scraper.delay_between_requests", 1.0) time.sleep(delay + random.uniform(0, 1)) except Exception as e: self.logger.error(f"Error scraping page {page} from {source.get('name', 'Unknown')}: {str(e)}") continue return source_data def _scrape_with_selenium(self, source: Dict[str, Any]) -> List[Dict[str, Any]]: """ Scrape data using Selenium WebDriver. Args: source: Source configuration dictionary Returns: List of dictionaries containing scraped data """ source_data = [] url = source.get("url", "") selectors = source.get("selectors", {}) pagination = source.get("pagination", {}) max_pages = pagination.get("max_pages", 1) try: self.driver.get(url) for page in range(1, max_pages + 1): self.logger.debug(f"Scraping page {page} with Selenium") # Wait for products to load try: WebDriverWait(self.driver, 10).until( EC.presence_of_element_located((By.CSS_SELECTOR, selectors.get("product", ""))) ) except TimeoutException: self.logger.warning(f"Timeout waiting for products to load on page {page}") continue # Extract product data products = self.driver.find_elements(By.CSS_SELECTOR, selectors.get("product", "")) for product in products: product_data = self._extract_product_data_selenium(product, selectors) if product_data: product_data["source"] = source.get("name", "Unknown") product_data["scraped_at"] = datetime.now().isoformat() source_data.append(product_data) # Navigate to next page if available if page < max_pages: try: next_button = self.driver.find_element(By.CSS_SELECTOR, pagination.get("next_page", "")) next_button.click() # Wait for page to load time.sleep(2) except NoSuchElementException: self.logger.debug(f"No next page button found after page {page}") break # Delay between requests delay = self.config.get("scraper.delay_between_requests", 1.0) time.sleep(delay + random.uniform(0, 1)) except Exception as e: self.logger.error(f"Error scraping with Selenium from {source.get('name', 'Unknown')}: {str(e)}") return source_data def _make_request_with_retry(self, url: str) -> Optional[requests.Response]: """ Make HTTP request with retry logic. Args: url: URL to request Returns: Response object or None if failed """ max_retries = self.config.get("scraper.max_retries", 3) timeout = self.config.get("scraper.timeout", 30) for attempt in range(max_retries): try: response = self.session.get(url, timeout=timeout) response.raise_for_status() return response except requests.RequestException as e: self.logger.warning(f"Request attempt {attempt + 1} failed for {url}: {str(e)}") if attempt < max_retries - 1: # Exponential backoff time.sleep((2 ** attempt) + random.uniform(0, 1)) else: self.logger.error(f"Max retries exceeded for {url}") return None return None def _extract_product_data(self, product: BeautifulSoup, selectors: Dict[str, str]) -> Optional[Dict[str, Any]]: """ Extract product data from HTML element using BeautifulSoup. Args: product: BeautifulSoup element containing product data selectors: Dictionary of CSS selectors Returns: Dictionary containing product data or None if extraction failed """ try: data = {} # Extract name name_element = product.select_one(selectors.get("name", "")) data["name"] = name_element.get_text(strip=True) if name_element else None # Extract price price_element = product.select_one(selectors.get("price", "")) if price_element: price_text = price_element.get_text(strip=True) data["price"] = self._parse_price(price_text) else: data["price"] = None # Extract rating rating_element = product.select_one(selectors.get("rating", "")) if rating_element: rating_text = rating_element.get_text(strip=True) data["rating"] = self._parse_rating(rating_text) else: data["rating"] = None # Extract availability availability_element = product.select_one(selectors.get("availability", "")) data["availability"] = availability_element.get_text(strip=True) if availability_element else None # Extract URL if available link_element = product.select_one("a") if link_element and link_element.get("href"): data["url"] = link_element.get("href") return data if data.get("name") else None except Exception as e: self.logger.error(f"Error extracting product data: {str(e)}") return None def _extract_product_data_selenium(self, product, selectors: Dict[str, str]) -> Optional[Dict[str, Any]]: """ Extract product data from Selenium WebElement. Args: product: Selenium WebElement containing product data selectors: Dictionary of CSS selectors Returns: Dictionary containing product data or None if extraction failed """ try: data = {} # Extract name try: name_element = product.find_element(By.CSS_SELECTOR, selectors.get("name", "")) data["name"] = name_element.text.strip() except NoSuchElementException: data["name"] = None # Extract price try: price_element = product.find_element(By.CSS_SELECTOR, selectors.get("price", "")) price_text = price_element.text.strip() data["price"] = self._parse_price(price_text) except NoSuchElementException: data["price"] = None # Extract rating try: rating_element = product.find_element(By.CSS_SELECTOR, selectors.get("rating", "")) rating_text = rating_element.text.strip() data["rating"] = self._parse_rating(rating_text) except NoSuchElementException: data["rating"] = None # Extract availability try: availability_element = product.find_element(By.CSS_SELECTOR, selectors.get("availability", "")) data["availability"] = availability_element.text.strip() except NoSuchElementException: data["availability"] = None # Extract URL if available try: link_element = product.find_element(By.CSS_SELECTOR, "a") if link_element.get_attribute("href"): data["url"] = link_element.get_attribute("href") except NoSuchElementException: pass return data if data.get("name") else None except Exception as e: self.logger.error(f"Error extracting product data with Selenium: {str(e)}") return None def _parse_price(self, price_text: str) -> Optional[float]: """ Parse price text to extract numeric value. Args: price_text: Raw price text Returns: Parsed price as float or None if parsing failed """ try: # Remove currency symbols and whitespace price_clean = price_text.replace('$', '').replace('€', '').replace('£', '').strip() # Handle comma as decimal separator price_clean = price_clean.replace(',', '.') # Extract numeric part import re price_match = re.search(r'[\d.]+', price_clean) if price_match: return float(price_match.group()) return None except Exception as e: self.logger.error(f"Error parsing price '{price_text}': {str(e)}") return None def _parse_rating(self, rating_text: str) -> Optional[float]: """ Parse rating text to extract numeric value. Args: rating_text: Raw rating text Returns: Parsed rating as float or None if parsing failed """ try: # Extract numeric part import re rating_match = re.search(r'[\d.]+', rating_text) if rating_match: return float(rating_match.group()) return None except Exception as e: self.logger.error(f"Error parsing rating '{rating_text}': {str(e)}") return None def save_data(self, data: List[Dict[str, Any]], output_path: str) -> None: """ Save scraped data to file. Args: data: List of dictionaries containing scraped data output_path: Path to output file """ try: output_file = Path(output_path) output_file.parent.mkdir(parents=True, exist_ok=True) # Convert to DataFrame df = pd.DataFrame(data) # Save based on file extension if output_path.endswith('.csv'): df.to_csv(output_path, index=False) elif output_path.endswith('.json'): df.to_json(output_path, orient='records', indent=2) elif output_path.endswith('.xlsx'): df.to_excel(output_path, index=False) else: # Default to CSV output_path = output_path + '.csv' df.to_csv(output_path, index=False) self.logger.info(f"Data saved to {output_path}") except Exception as e: self.logger.error(f"Error saving data to {output_path}: {str(e)}") raise def analyze_trends(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: """ Analyze market trends from scraped data. Args: data: List of dictionaries containing scraped data Returns: Dictionary containing trend analysis results """ try: if not data: return {"error": "No data available for analysis"} # Convert to DataFrame df = pd.DataFrame(data) # Basic statistics analysis = { "total_products": len(df), "sources": df["source"].value_counts().to_dict(), "price_analysis": {}, "rating_analysis": {}, "availability_analysis": {} } # Price analysis if "price" in df.columns: price_data = df["price"].dropna() if not price_data.empty: analysis["price_analysis"] = { "average_price": float(price_data.mean()), "min_price": float(price_data.min()), "max_price": float(price_data.max()), "median_price": float(price_data.median()), "price_distribution": price_data.describe().to_dict() } # Rating analysis if "rating" in df.columns: rating_data = df["rating"].dropna() if not rating_data.empty: analysis["rating_analysis"] = { "average_rating": float(rating_data.mean()), "min_rating": float(rating_data.min()), "max_rating": float(rating_data.max()), "rating_distribution": rating_data.value_counts().to_dict() } # Availability analysis if "availability" in df.columns: availability_data = df["availability"].dropna() if not availability_data.empty: analysis["availability_analysis"] = availability_data.value_counts().to_dict() # Price trends by source if "price" in df.columns and "source" in df.columns: price_by_source = df.groupby("source")["price"].agg(["mean", "min", "max"]).to_dict() analysis["price_by_source"] = price_by_source self.logger.info("Trend analysis completed") return analysis except Exception as e: self.logger.error(f"Error analyzing trends: {str(e)}") return {"error": str(e)} def save_analysis(self, analysis: Dict[str, Any], output_path: str) -> None: """ Save trend analysis results to file. Args: analysis: Dictionary containing analysis results output_path: Path to output file """ try: output_file = Path(output_path) output_file.parent.mkdir(parents=True, exist_ok=True) with open(output_file, 'w', encoding='utf-8') as f: json.dump(analysis, f, indent=2, ensure_ascii=False) self.logger.info(f"Analysis saved to {output_path}") except Exception as e: self.logger.error(f"Error saving analysis to {output_path}: {str(e)}") raise def close(self) -> None: """ Close browser and session. """ try: if self.driver: self.driver.quit() self.driver = None if self.session: self.session.close() self.logger.info("Browser and session closed") except Exception as e: self.logger.error(f"Error closing browser/session: {str(e)}") def __enter__(self): """Context manager entry.""" return self def __exit__(self, exc_type, exc_val, exc_tb): """Context manager exit.""" self.close()