Initial commit: Market Trends Scraper
src/scraper.py (new file, 618 lines)
@@ -0,0 +1,618 @@
"""
Web Scraper Module

This module contains the core functionality for scraping e-commerce websites
to collect product and pricing data for market trend analysis.
"""

import time
import json
import random
from datetime import datetime
from typing import Dict, List, Any, Optional, Union
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException

from logger import LoggerMixin
from config_manager import ConfigManager


class MarketTrendsScraper(LoggerMixin):
    """
    Main scraper class for collecting market trends data from e-commerce websites.
    """

    def __init__(self, config: Dict[str, Any], headless: bool = True):
        """
        Initialize the scraper with configuration.

        Args:
            config: Configuration dictionary
            headless: Whether to run browser in headless mode
        """
        self.config = config
        self.driver = None
        self.session = requests.Session()
        self.data = []
        self._setup_browser(headless)
        self._setup_session()

    def _setup_browser(self, headless: bool = True) -> None:
        """
        Set up the Selenium WebDriver with appropriate options.

        Args:
            headless: Whether to run browser in headless mode
        """
        try:
            chrome_options = Options()

            # Set headless mode
            if headless:
                chrome_options.add_argument("--headless")

            # Set window size
            window_size = self.config.get("scraper.window_size", [1920, 1080])
            chrome_options.add_argument(f"--window-size={window_size[0]},{window_size[1]}")

            # Add other options for stability
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--disable-web-security")
            chrome_options.add_argument("--disable-features=VizDisplayCompositor")

            # Set user agent
            user_agent = self.config.get("scraper.user_agent", "")
            if user_agent:
                chrome_options.add_argument(f"--user-agent={user_agent}")

            # Initialize driver
            self.driver = webdriver.Chrome(options=chrome_options)
            self.driver.set_page_load_timeout(self.config.get("scraper.timeout", 30))

            self.logger.info("Browser setup completed")

        except Exception as e:
            self.logger.error(f"Failed to setup browser: {str(e)}")
            raise

    def _setup_session(self) -> None:
        """
        Set up the requests session with appropriate headers.
        """
        user_agent = self.config.get("scraper.user_agent", "")
        if user_agent:
            self.session.headers.update({"User-Agent": user_agent})

        # Add other headers
        self.session.headers.update({
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1"
        })

        self.logger.info("Session setup completed")

    def scrape_market_trends(self) -> List[Dict[str, Any]]:
        """
        Scrape market trends data from all configured sources.

        Returns:
            List of dictionaries containing scraped data
        """
        all_data = []
        sources = self.config.get("sources", [])

        for source in sources:
            if not source.get("enabled", True):
                self.logger.info(f"Skipping disabled source: {source.get('name', 'Unknown')}")
                continue

            self.logger.info(f"Scraping source: {source.get('name', 'Unknown')}")

            try:
                source_data = self._scrape_source(source)
                all_data.extend(source_data)
                self.logger.info(f"Scraped {len(source_data)} records from {source.get('name', 'Unknown')}")
            except Exception as e:
                self.logger.error(f"Failed to scrape source {source.get('name', 'Unknown')}: {str(e)}")
                continue

        return all_data

    def _scrape_source(self, source: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Scrape data from a specific source.

        Args:
            source: Source configuration dictionary

        Returns:
            List of dictionaries containing scraped data
        """
        source_data = []
        url = source.get("url", "")
        selectors = source.get("selectors", {})
        pagination = source.get("pagination", {})

        if not url:
            self.logger.warning(f"No URL configured for source: {source.get('name', 'Unknown')}")
            return source_data

        # Determine scraping method
        use_selenium = source.get("use_selenium", False)

        if use_selenium:
            return self._scrape_with_selenium(source)
        else:
            return self._scrape_with_requests(source)

    def _scrape_with_requests(self, source: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Scrape data using requests and BeautifulSoup.

        Args:
            source: Source configuration dictionary

        Returns:
            List of dictionaries containing scraped data
        """
        source_data = []
        url = source.get("url", "")
        selectors = source.get("selectors", {})
        pagination = source.get("pagination", {})
        max_pages = pagination.get("max_pages", 1)

        for page in range(1, max_pages + 1):
            try:
                # Add page parameter if needed
                page_url = url
                if page > 1:
                    page_url = f"{url}?page={page}"

                self.logger.debug(f"Scraping page {page}: {page_url}")

                # Make request with retry logic
                response = self._make_request_with_retry(page_url)

                if not response:
                    continue

                # Parse HTML
                soup = BeautifulSoup(response.text, 'html.parser')

                # Extract product data
                products = soup.select(selectors.get("product", ""))

                for product in products:
                    product_data = self._extract_product_data(product, selectors)
                    if product_data:
                        product_data["source"] = source.get("name", "Unknown")
                        product_data["scraped_at"] = datetime.now().isoformat()
                        source_data.append(product_data)

                # Check if there's a next page
                if page < max_pages:
                    next_page = soup.select_one(pagination.get("next_page", ""))
                    if not next_page:
                        self.logger.debug(f"No more pages found after page {page}")
                        break

                # Delay between requests
                delay = self.config.get("scraper.delay_between_requests", 1.0)
                time.sleep(delay + random.uniform(0, 1))

            except Exception as e:
                self.logger.error(f"Error scraping page {page} from {source.get('name', 'Unknown')}: {str(e)}")
                continue

        return source_data

    def _scrape_with_selenium(self, source: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Scrape data using Selenium WebDriver.

        Args:
            source: Source configuration dictionary

        Returns:
            List of dictionaries containing scraped data
        """
        source_data = []
        url = source.get("url", "")
        selectors = source.get("selectors", {})
        pagination = source.get("pagination", {})
        max_pages = pagination.get("max_pages", 1)

        try:
            self.driver.get(url)

            for page in range(1, max_pages + 1):
                self.logger.debug(f"Scraping page {page} with Selenium")

                # Wait for products to load
                try:
                    WebDriverWait(self.driver, 10).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, selectors.get("product", "")))
                    )
                except TimeoutException:
                    self.logger.warning(f"Timeout waiting for products to load on page {page}")
                    continue

                # Extract product data
                products = self.driver.find_elements(By.CSS_SELECTOR, selectors.get("product", ""))

                for product in products:
                    product_data = self._extract_product_data_selenium(product, selectors)
                    if product_data:
                        product_data["source"] = source.get("name", "Unknown")
                        product_data["scraped_at"] = datetime.now().isoformat()
                        source_data.append(product_data)

                # Navigate to next page if available
                if page < max_pages:
                    try:
                        next_button = self.driver.find_element(By.CSS_SELECTOR, pagination.get("next_page", ""))
                        next_button.click()

                        # Wait for page to load
                        time.sleep(2)
                    except NoSuchElementException:
                        self.logger.debug(f"No next page button found after page {page}")
                        break

                # Delay between requests
                delay = self.config.get("scraper.delay_between_requests", 1.0)
                time.sleep(delay + random.uniform(0, 1))

        except Exception as e:
            self.logger.error(f"Error scraping with Selenium from {source.get('name', 'Unknown')}: {str(e)}")

        return source_data

    def _make_request_with_retry(self, url: str) -> Optional[requests.Response]:
        """
        Make HTTP request with retry logic.

        Args:
            url: URL to request

        Returns:
            Response object or None if failed
        """
        max_retries = self.config.get("scraper.max_retries", 3)
        timeout = self.config.get("scraper.timeout", 30)

        for attempt in range(max_retries):
            try:
                response = self.session.get(url, timeout=timeout)
                response.raise_for_status()
                return response
            except requests.RequestException as e:
                self.logger.warning(f"Request attempt {attempt + 1} failed for {url}: {str(e)}")
                if attempt < max_retries - 1:
                    # Exponential backoff
                    time.sleep((2 ** attempt) + random.uniform(0, 1))
                else:
                    self.logger.error(f"Max retries exceeded for {url}")
                    return None

        return None

    def _extract_product_data(self, product: BeautifulSoup, selectors: Dict[str, str]) -> Optional[Dict[str, Any]]:
        """
        Extract product data from HTML element using BeautifulSoup.

        Args:
            product: BeautifulSoup element containing product data
            selectors: Dictionary of CSS selectors

        Returns:
            Dictionary containing product data or None if extraction failed
        """
        try:
            data = {}

            # Extract name
            name_element = product.select_one(selectors.get("name", ""))
            data["name"] = name_element.get_text(strip=True) if name_element else None

            # Extract price
            price_element = product.select_one(selectors.get("price", ""))
            if price_element:
                price_text = price_element.get_text(strip=True)
                data["price"] = self._parse_price(price_text)
            else:
                data["price"] = None

            # Extract rating
            rating_element = product.select_one(selectors.get("rating", ""))
            if rating_element:
                rating_text = rating_element.get_text(strip=True)
                data["rating"] = self._parse_rating(rating_text)
            else:
                data["rating"] = None

            # Extract availability
            availability_element = product.select_one(selectors.get("availability", ""))
            data["availability"] = availability_element.get_text(strip=True) if availability_element else None

            # Extract URL if available
            link_element = product.select_one("a")
            if link_element and link_element.get("href"):
                data["url"] = link_element.get("href")

            return data if data.get("name") else None

        except Exception as e:
            self.logger.error(f"Error extracting product data: {str(e)}")
            return None

    def _extract_product_data_selenium(self, product, selectors: Dict[str, str]) -> Optional[Dict[str, Any]]:
        """
        Extract product data from Selenium WebElement.

        Args:
            product: Selenium WebElement containing product data
            selectors: Dictionary of CSS selectors

        Returns:
            Dictionary containing product data or None if extraction failed
        """
        try:
            data = {}

            # Extract name
            try:
                name_element = product.find_element(By.CSS_SELECTOR, selectors.get("name", ""))
                data["name"] = name_element.text.strip()
            except NoSuchElementException:
                data["name"] = None

            # Extract price
            try:
                price_element = product.find_element(By.CSS_SELECTOR, selectors.get("price", ""))
                price_text = price_element.text.strip()
                data["price"] = self._parse_price(price_text)
            except NoSuchElementException:
                data["price"] = None

            # Extract rating
            try:
                rating_element = product.find_element(By.CSS_SELECTOR, selectors.get("rating", ""))
                rating_text = rating_element.text.strip()
                data["rating"] = self._parse_rating(rating_text)
            except NoSuchElementException:
                data["rating"] = None

            # Extract availability
            try:
                availability_element = product.find_element(By.CSS_SELECTOR, selectors.get("availability", ""))
                data["availability"] = availability_element.text.strip()
            except NoSuchElementException:
                data["availability"] = None

            # Extract URL if available
            try:
                link_element = product.find_element(By.CSS_SELECTOR, "a")
                if link_element.get_attribute("href"):
                    data["url"] = link_element.get_attribute("href")
            except NoSuchElementException:
                pass

            return data if data.get("name") else None

        except Exception as e:
            self.logger.error(f"Error extracting product data with Selenium: {str(e)}")
            return None

    def _parse_price(self, price_text: str) -> Optional[float]:
        """
        Parse price text to extract numeric value.

        Args:
            price_text: Raw price text

        Returns:
            Parsed price as float or None if parsing failed
        """
        try:
            # Remove currency symbols and whitespace
            price_clean = price_text.replace('$', '').replace('€', '').replace('£', '').strip()

            # Handle comma as decimal separator
            price_clean = price_clean.replace(',', '.')

            # Extract numeric part
            import re
            price_match = re.search(r'[\d.]+', price_clean)
            if price_match:
                return float(price_match.group())

            return None

        except Exception as e:
            self.logger.error(f"Error parsing price '{price_text}': {str(e)}")
            return None

    def _parse_rating(self, rating_text: str) -> Optional[float]:
        """
        Parse rating text to extract numeric value.

        Args:
            rating_text: Raw rating text

        Returns:
            Parsed rating as float or None if parsing failed
        """
        try:
            # Extract numeric part
            import re
            rating_match = re.search(r'[\d.]+', rating_text)
            if rating_match:
                return float(rating_match.group())

            return None

        except Exception as e:
            self.logger.error(f"Error parsing rating '{rating_text}': {str(e)}")
            return None

    def save_data(self, data: List[Dict[str, Any]], output_path: str) -> None:
        """
        Save scraped data to file.

        Args:
            data: List of dictionaries containing scraped data
            output_path: Path to output file
        """
        try:
            output_file = Path(output_path)
            output_file.parent.mkdir(parents=True, exist_ok=True)

            # Convert to DataFrame
            df = pd.DataFrame(data)

            # Save based on file extension
            if output_path.endswith('.csv'):
                df.to_csv(output_path, index=False)
            elif output_path.endswith('.json'):
                df.to_json(output_path, orient='records', indent=2)
            elif output_path.endswith('.xlsx'):
                df.to_excel(output_path, index=False)
            else:
                # Default to CSV
                output_path = output_path + '.csv'
                df.to_csv(output_path, index=False)

            self.logger.info(f"Data saved to {output_path}")

        except Exception as e:
            self.logger.error(f"Error saving data to {output_path}: {str(e)}")
            raise

    def analyze_trends(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Analyze market trends from scraped data.

        Args:
            data: List of dictionaries containing scraped data

        Returns:
            Dictionary containing trend analysis results
        """
        try:
            if not data:
                return {"error": "No data available for analysis"}

            # Convert to DataFrame
            df = pd.DataFrame(data)

            # Basic statistics
            analysis = {
                "total_products": len(df),
                "sources": df["source"].value_counts().to_dict(),
                "price_analysis": {},
                "rating_analysis": {},
                "availability_analysis": {}
            }

            # Price analysis
            if "price" in df.columns:
                price_data = df["price"].dropna()
                if not price_data.empty:
                    analysis["price_analysis"] = {
                        "average_price": float(price_data.mean()),
                        "min_price": float(price_data.min()),
                        "max_price": float(price_data.max()),
                        "median_price": float(price_data.median()),
                        "price_distribution": price_data.describe().to_dict()
                    }

            # Rating analysis
            if "rating" in df.columns:
                rating_data = df["rating"].dropna()
                if not rating_data.empty:
                    analysis["rating_analysis"] = {
                        "average_rating": float(rating_data.mean()),
                        "min_rating": float(rating_data.min()),
                        "max_rating": float(rating_data.max()),
                        "rating_distribution": rating_data.value_counts().to_dict()
                    }

            # Availability analysis
            if "availability" in df.columns:
                availability_data = df["availability"].dropna()
                if not availability_data.empty:
                    analysis["availability_analysis"] = availability_data.value_counts().to_dict()

            # Price trends by source
            if "price" in df.columns and "source" in df.columns:
                price_by_source = df.groupby("source")["price"].agg(["mean", "min", "max"]).to_dict()
                analysis["price_by_source"] = price_by_source

            self.logger.info("Trend analysis completed")
            return analysis

        except Exception as e:
            self.logger.error(f"Error analyzing trends: {str(e)}")
            return {"error": str(e)}

    def save_analysis(self, analysis: Dict[str, Any], output_path: str) -> None:
        """
        Save trend analysis results to file.

        Args:
            analysis: Dictionary containing analysis results
            output_path: Path to output file
        """
        try:
            output_file = Path(output_path)
            output_file.parent.mkdir(parents=True, exist_ok=True)

            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(analysis, f, indent=2, ensure_ascii=False)

            self.logger.info(f"Analysis saved to {output_path}")

        except Exception as e:
            self.logger.error(f"Error saving analysis to {output_path}: {str(e)}")
            raise

    def close(self) -> None:
        """
        Close browser and session.
        """
        try:
            if self.driver:
                self.driver.quit()
                self.driver = None

            if self.session:
                self.session.close()

            self.logger.info("Browser and session closed")

        except Exception as e:
            self.logger.error(f"Error closing browser/session: {str(e)}")

    def __enter__(self):
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit."""
        self.close()
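Usage sketch (not part of the committed file): the class reads its settings via plain dict lookups such as self.config.get("scraper.timeout", 30), so the minimal assumption below is a flat dict with those literal dotted keys. The source name, URL, selectors, and output paths are hypothetical placeholders, and because __init__ always launches Chrome, a chromedriver must be available even when a source uses the requests path.

    # Hypothetical example; config keys mirror the self.config.get(...) calls in scraper.py.
    from scraper import MarketTrendsScraper

    config = {
        "scraper.timeout": 30,
        "scraper.max_retries": 3,
        "scraper.delay_between_requests": 1.0,
        "scraper.window_size": [1920, 1080],
        "scraper.user_agent": "Mozilla/5.0 (compatible; MarketTrendsBot/0.1)",  # placeholder UA
        "sources": [
            {
                "name": "Example Shop",                  # hypothetical source
                "url": "https://example.com/products",   # placeholder URL
                "enabled": True,
                "use_selenium": False,                   # requests + BeautifulSoup path
                "selectors": {
                    "product": "div.product",
                    "name": "h2.title",
                    "price": "span.price",
                    "rating": "span.rating",
                    "availability": "span.stock",
                },
                "pagination": {"max_pages": 2, "next_page": "a.next"},
            }
        ],
    }

    # Context manager ensures the Chrome driver and the requests session are closed.
    with MarketTrendsScraper(config, headless=True) as scraper:
        records = scraper.scrape_market_trends()
        scraper.save_data(records, "output/products.csv")
        analysis = scraper.analyze_trends(records)
        scraper.save_analysis(analysis, "output/analysis.json")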