# trends-scraper/src/scraper.py
"""
Web Scraper Module
This module contains the core functionality for scraping e-commerce websites
to collect product and pricing data for market trend analysis.
"""
import time
import json
import random
import re
from datetime import datetime
from typing import Dict, List, Any, Optional, Union
from pathlib import Path
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from logger import LoggerMixin
from config_manager import ConfigManager
class MarketTrendsScraper(LoggerMixin):
"""
Main scraper class for collecting market trends data from e-commerce websites.
"""
def __init__(self, config: Dict[str, Any], headless: bool = True):
"""
Initialize the scraper with configuration.
Args:
config: Configuration dictionary
headless: Whether to run browser in headless mode
"""
self.config = config
self.driver = None
self.session = requests.Session()
self.data = []
self._setup_browser(headless)
self._setup_session()
def _setup_browser(self, headless: bool = True) -> None:
"""
Set up the Selenium WebDriver with appropriate options.
Args:
headless: Whether to run browser in headless mode
"""
try:
chrome_options = Options()
# Set headless mode
if headless:
chrome_options.add_argument("--headless")
# Set window size
window_size = self.config.get("scraper.window_size", [1920, 1080])
chrome_options.add_argument(f"--window-size={window_size[0]},{window_size[1]}")
# Add other options for stability
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--disable-web-security")
chrome_options.add_argument("--disable-features=VizDisplayCompositor")
# Set user agent
user_agent = self.config.get("scraper.user_agent", "")
if user_agent:
chrome_options.add_argument(f"--user-agent={user_agent}")
# Initialize driver
self.driver = webdriver.Chrome(options=chrome_options)
self.driver.set_page_load_timeout(self.config.get("scraper.timeout", 30))
self.logger.info("Browser setup completed")
except Exception as e:
self.logger.error(f"Failed to setup browser: {str(e)}")
raise
def _setup_session(self) -> None:
"""
Set up the requests session with appropriate headers.
"""
user_agent = self.config.get("scraper.user_agent", "")
if user_agent:
self.session.headers.update({"User-Agent": user_agent})
# Add other headers
self.session.headers.update({
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1"
})
self.logger.info("Session setup completed")
def scrape_market_trends(self) -> List[Dict[str, Any]]:
"""
Scrape market trends data from all configured sources.
Returns:
List of dictionaries containing scraped data
"""
all_data = []
sources = self.config.get("sources", [])
for source in sources:
if not source.get("enabled", True):
self.logger.info(f"Skipping disabled source: {source.get('name', 'Unknown')}")
continue
self.logger.info(f"Scraping source: {source.get('name', 'Unknown')}")
try:
source_data = self._scrape_source(source)
all_data.extend(source_data)
self.logger.info(f"Scraped {len(source_data)} records from {source.get('name', 'Unknown')}")
except Exception as e:
self.logger.error(f"Failed to scrape source {source.get('name', 'Unknown')}: {str(e)}")
continue
return all_data
def _scrape_source(self, source: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
Scrape data from a specific source.
Args:
source: Source configuration dictionary
Returns:
List of dictionaries containing scraped data
"""
        url = source.get("url", "")
        if not url:
            self.logger.warning(f"No URL configured for source: {source.get('name', 'Unknown')}")
            return []
# Determine scraping method
use_selenium = source.get("use_selenium", False)
if use_selenium:
return self._scrape_with_selenium(source)
else:
return self._scrape_with_requests(source)
def _scrape_with_requests(self, source: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
Scrape data using requests and BeautifulSoup.
Args:
source: Source configuration dictionary
Returns:
List of dictionaries containing scraped data
"""
source_data = []
url = source.get("url", "")
selectors = source.get("selectors", {})
pagination = source.get("pagination", {})
max_pages = pagination.get("max_pages", 1)
for page in range(1, max_pages + 1):
try:
                # Add page parameter if needed (preserve any existing query string)
                page_url = url
                if page > 1:
                    separator = '&' if '?' in url else '?'
                    page_url = f"{url}{separator}page={page}"
self.logger.debug(f"Scraping page {page}: {page_url}")
# Make request with retry logic
response = self._make_request_with_retry(page_url)
if not response:
continue
# Parse HTML
soup = BeautifulSoup(response.text, 'html.parser')
# Extract product data
products = soup.select(selectors.get("product", ""))
for product in products:
                    product_data = self._extract_product_data(product, selectors, base_url=page_url)
if product_data:
product_data["source"] = source.get("name", "Unknown")
product_data["scraped_at"] = datetime.now().isoformat()
source_data.append(product_data)
# Check if there's a next page
if page < max_pages:
next_page = soup.select_one(pagination.get("next_page", ""))
if not next_page:
self.logger.debug(f"No more pages found after page {page}")
break
# Delay between requests
delay = self.config.get("scraper.delay_between_requests", 1.0)
time.sleep(delay + random.uniform(0, 1))
except Exception as e:
self.logger.error(f"Error scraping page {page} from {source.get('name', 'Unknown')}: {str(e)}")
continue
return source_data
def _scrape_with_selenium(self, source: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
Scrape data using Selenium WebDriver.
Args:
source: Source configuration dictionary
Returns:
List of dictionaries containing scraped data
"""
source_data = []
url = source.get("url", "")
selectors = source.get("selectors", {})
pagination = source.get("pagination", {})
max_pages = pagination.get("max_pages", 1)
try:
self.driver.get(url)
for page in range(1, max_pages + 1):
self.logger.debug(f"Scraping page {page} with Selenium")
# Wait for products to load
try:
WebDriverWait(self.driver, 10).until(
EC.presence_of_element_located((By.CSS_SELECTOR, selectors.get("product", "")))
)
                except TimeoutException:
                    self.logger.warning(f"Timeout waiting for products to load on page {page}")
                    # No navigation has happened yet, so retrying the same page only repeats the wait
                    break
# Extract product data
products = self.driver.find_elements(By.CSS_SELECTOR, selectors.get("product", ""))
for product in products:
product_data = self._extract_product_data_selenium(product, selectors)
if product_data:
product_data["source"] = source.get("name", "Unknown")
product_data["scraped_at"] = datetime.now().isoformat()
source_data.append(product_data)
# Navigate to next page if available
if page < max_pages:
try:
next_button = self.driver.find_element(By.CSS_SELECTOR, pagination.get("next_page", ""))
next_button.click()
# Wait for page to load
time.sleep(2)
except NoSuchElementException:
self.logger.debug(f"No next page button found after page {page}")
break
# Delay between requests
delay = self.config.get("scraper.delay_between_requests", 1.0)
time.sleep(delay + random.uniform(0, 1))
except Exception as e:
self.logger.error(f"Error scraping with Selenium from {source.get('name', 'Unknown')}: {str(e)}")
return source_data
def _make_request_with_retry(self, url: str) -> Optional[requests.Response]:
"""
Make HTTP request with retry logic.
Args:
url: URL to request
Returns:
Response object or None if failed
"""
max_retries = self.config.get("scraper.max_retries", 3)
timeout = self.config.get("scraper.timeout", 30)
for attempt in range(max_retries):
try:
response = self.session.get(url, timeout=timeout)
response.raise_for_status()
return response
except requests.RequestException as e:
self.logger.warning(f"Request attempt {attempt + 1} failed for {url}: {str(e)}")
if attempt < max_retries - 1:
# Exponential backoff
time.sleep((2 ** attempt) + random.uniform(0, 1))
else:
self.logger.error(f"Max retries exceeded for {url}")
return None
return None
    def _extract_product_data(self, product: BeautifulSoup, selectors: Dict[str, str], base_url: str = "") -> Optional[Dict[str, Any]]:
        """
        Extract product data from an HTML element using BeautifulSoup.
        Args:
            product: BeautifulSoup element containing product data
            selectors: Dictionary of CSS selectors
            base_url: Page URL used to resolve relative product links
        Returns:
            Dictionary containing product data or None if extraction failed
        """
try:
data = {}
# Extract name
name_element = product.select_one(selectors.get("name", ""))
data["name"] = name_element.get_text(strip=True) if name_element else None
# Extract price
price_element = product.select_one(selectors.get("price", ""))
if price_element:
price_text = price_element.get_text(strip=True)
data["price"] = self._parse_price(price_text)
else:
data["price"] = None
# Extract rating
rating_element = product.select_one(selectors.get("rating", ""))
if rating_element:
rating_text = rating_element.get_text(strip=True)
data["rating"] = self._parse_rating(rating_text)
else:
data["rating"] = None
# Extract availability
availability_element = product.select_one(selectors.get("availability", ""))
data["availability"] = availability_element.get_text(strip=True) if availability_element else None
            # Extract URL if available (resolve relative links against the page URL)
            link_element = product.select_one("a")
            if link_element and link_element.get("href"):
                data["url"] = urljoin(base_url, link_element.get("href"))
return data if data.get("name") else None
except Exception as e:
self.logger.error(f"Error extracting product data: {str(e)}")
return None
def _extract_product_data_selenium(self, product, selectors: Dict[str, str]) -> Optional[Dict[str, Any]]:
"""
Extract product data from Selenium WebElement.
Args:
product: Selenium WebElement containing product data
selectors: Dictionary of CSS selectors
Returns:
Dictionary containing product data or None if extraction failed
"""
try:
data = {}
# Extract name
try:
name_element = product.find_element(By.CSS_SELECTOR, selectors.get("name", ""))
data["name"] = name_element.text.strip()
except NoSuchElementException:
data["name"] = None
# Extract price
try:
price_element = product.find_element(By.CSS_SELECTOR, selectors.get("price", ""))
price_text = price_element.text.strip()
data["price"] = self._parse_price(price_text)
except NoSuchElementException:
data["price"] = None
# Extract rating
try:
rating_element = product.find_element(By.CSS_SELECTOR, selectors.get("rating", ""))
rating_text = rating_element.text.strip()
data["rating"] = self._parse_rating(rating_text)
except NoSuchElementException:
data["rating"] = None
# Extract availability
try:
availability_element = product.find_element(By.CSS_SELECTOR, selectors.get("availability", ""))
data["availability"] = availability_element.text.strip()
except NoSuchElementException:
data["availability"] = None
# Extract URL if available
try:
link_element = product.find_element(By.CSS_SELECTOR, "a")
if link_element.get_attribute("href"):
data["url"] = link_element.get_attribute("href")
except NoSuchElementException:
pass
return data if data.get("name") else None
except Exception as e:
self.logger.error(f"Error extracting product data with Selenium: {str(e)}")
return None
def _parse_price(self, price_text: str) -> Optional[float]:
"""
Parse price text to extract numeric value.
Args:
price_text: Raw price text
Returns:
Parsed price as float or None if parsing failed
"""
        try:
            # Remove currency symbols and surrounding whitespace (e.g. "$1,299.99" -> "1,299.99")
            price_clean = price_text.replace('$', '').replace('€', '').replace('£', '').strip()
            # Normalise separators: drop thousands commas ("1,299.99" -> "1299.99"),
            # otherwise treat a lone comma as a decimal separator ("1299,99" -> "1299.99")
            if ',' in price_clean and '.' in price_clean:
                price_clean = price_clean.replace(',', '')
            else:
                price_clean = price_clean.replace(',', '.')
            # Extract the numeric part
            price_match = re.search(r'[\d.]+', price_clean)
            if price_match:
                return float(price_match.group())
            return None
except Exception as e:
self.logger.error(f"Error parsing price '{price_text}': {str(e)}")
return None
def _parse_rating(self, rating_text: str) -> Optional[float]:
"""
Parse rating text to extract numeric value.
Args:
rating_text: Raw rating text
Returns:
Parsed rating as float or None if parsing failed
"""
try:
            # Extract the numeric part (e.g. "4.5 out of 5 stars" -> 4.5)
rating_match = re.search(r'[\d.]+', rating_text)
if rating_match:
return float(rating_match.group())
return None
except Exception as e:
self.logger.error(f"Error parsing rating '{rating_text}': {str(e)}")
return None
def save_data(self, data: List[Dict[str, Any]], output_path: str) -> None:
"""
Save scraped data to file.
Args:
data: List of dictionaries containing scraped data
output_path: Path to output file
"""
try:
output_file = Path(output_path)
output_file.parent.mkdir(parents=True, exist_ok=True)
# Convert to DataFrame
df = pd.DataFrame(data)
# Save based on file extension
if output_path.endswith('.csv'):
df.to_csv(output_path, index=False)
elif output_path.endswith('.json'):
df.to_json(output_path, orient='records', indent=2)
elif output_path.endswith('.xlsx'):
df.to_excel(output_path, index=False)
else:
# Default to CSV
output_path = output_path + '.csv'
df.to_csv(output_path, index=False)
self.logger.info(f"Data saved to {output_path}")
except Exception as e:
self.logger.error(f"Error saving data to {output_path}: {str(e)}")
raise
def analyze_trends(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
"""
Analyze market trends from scraped data.
Args:
data: List of dictionaries containing scraped data
Returns:
Dictionary containing trend analysis results
"""
try:
if not data:
return {"error": "No data available for analysis"}
# Convert to DataFrame
df = pd.DataFrame(data)
# Basic statistics
analysis = {
"total_products": len(df),
"sources": df["source"].value_counts().to_dict(),
"price_analysis": {},
"rating_analysis": {},
"availability_analysis": {}
}
# Price analysis
if "price" in df.columns:
price_data = df["price"].dropna()
if not price_data.empty:
analysis["price_analysis"] = {
"average_price": float(price_data.mean()),
"min_price": float(price_data.min()),
"max_price": float(price_data.max()),
"median_price": float(price_data.median()),
"price_distribution": price_data.describe().to_dict()
}
# Rating analysis
if "rating" in df.columns:
rating_data = df["rating"].dropna()
if not rating_data.empty:
analysis["rating_analysis"] = {
"average_rating": float(rating_data.mean()),
"min_rating": float(rating_data.min()),
"max_rating": float(rating_data.max()),
"rating_distribution": rating_data.value_counts().to_dict()
}
# Availability analysis
if "availability" in df.columns:
availability_data = df["availability"].dropna()
if not availability_data.empty:
analysis["availability_analysis"] = availability_data.value_counts().to_dict()
# Price trends by source
if "price" in df.columns and "source" in df.columns:
price_by_source = df.groupby("source")["price"].agg(["mean", "min", "max"]).to_dict()
analysis["price_by_source"] = price_by_source
self.logger.info("Trend analysis completed")
return analysis
except Exception as e:
self.logger.error(f"Error analyzing trends: {str(e)}")
return {"error": str(e)}
def save_analysis(self, analysis: Dict[str, Any], output_path: str) -> None:
"""
Save trend analysis results to file.
Args:
analysis: Dictionary containing analysis results
output_path: Path to output file
"""
try:
output_file = Path(output_path)
output_file.parent.mkdir(parents=True, exist_ok=True)
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(analysis, f, indent=2, ensure_ascii=False)
self.logger.info(f"Analysis saved to {output_path}")
except Exception as e:
self.logger.error(f"Error saving analysis to {output_path}: {str(e)}")
raise
def close(self) -> None:
"""
Close browser and session.
"""
try:
if self.driver:
self.driver.quit()
self.driver = None
if self.session:
self.session.close()
self.logger.info("Browser and session closed")
except Exception as e:
self.logger.error(f"Error closing browser/session: {str(e)}")
def __enter__(self):
"""Context manager entry."""
return self
def __exit__(self, exc_type, exc_val, exc_tb):
"""Context manager exit."""
self.close()
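

if __name__ == "__main__":
    # Minimal usage sketch, assuming a flat dict with dotted keys and a single
    # requests-based source. Every URL, selector, and path below is a
    # placeholder for illustration, not part of any real configuration; a real
    # run would typically load the configuration through ConfigManager.
    example_config = {
        "scraper.timeout": 30,
        "scraper.max_retries": 3,
        "scraper.delay_between_requests": 1.0,
        "scraper.user_agent": "Mozilla/5.0 (compatible; MarketTrendsScraper/1.0)",
        "sources": [
            {
                "name": "Example Store",
                "url": "https://example.com/products",
                "enabled": True,
                "use_selenium": False,
                "selectors": {
                    "product": ".product-card",
                    "name": ".product-title",
                    "price": ".price",
                    "rating": ".rating",
                    "availability": ".stock-status",
                },
                "pagination": {"max_pages": 2, "next_page": "a.next"},
            }
        ],
    }
    with MarketTrendsScraper(example_config, headless=True) as scraper:
        records = scraper.scrape_market_trends()
        scraper.save_data(records, "output/market_trends.csv")
        scraper.save_analysis(scraper.analyze_trends(records), "output/analysis.json")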