Initial commit: Market Trends Scraper

2025-09-11 17:46:14 +03:00
commit 4ddcde68d4
17 changed files with 3049 additions and 0 deletions
--- a/examples/advanced_usage.py
+++ b/examples/advanced_usage.py
@@ -0,0 +1,327 @@
+
+#!/usr/bin/env python3
+"""
+Advanced Usage Example for Market Trends Scraper
+
+This script demonstrates advanced features of the Market Trends Scraper,
+including custom data processing, trend analysis, and visualization.
+"""
+
+import sys
+import json
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+from datetime import datetime, timedelta
+from pathlib import Path
+
+# Add src directory to Python path
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "src"))
+
+from config_manager import ConfigManager
+from scraper import MarketTrendsScraper
+from logger import setup_logger
+
+
+def create_custom_config():
+    """Assemble the demonstration configuration dictionary.
+
+    Returns:
+        A newly built nested dictionary with the top-level keys "scraper",
+        "sources", "output", "database" and "analysis", in that order.
+    """
+    # Values stored under the "scraper" key.
+    scraper_section = dict(
+        delay_between_requests=2.0,
+        timeout=30,
+        max_retries=3,
+        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
+        headless=True,
+        window_size=[1920, 1080],
+    )
+
+    # The single source entry: CSS selectors plus pagination limits.
+    books_source = dict(
+        name="books_store",
+        url="https://books.toscrape.com/",
+        type="ecommerce",
+        enabled=True,
+        use_selenium=False,
+        selectors=dict(
+            product="article.product_pod",
+            name="h3 a",
+            price="p.price_color",
+            rating="p.star-rating",
+            availability="p.instock.availability",
+        ),
+        pagination=dict(
+            next_page="li.next a",
+            max_pages=3,
+        ),
+    )
+
+    output_section = dict(
+        format="csv",
+        include_timestamp=True,
+        filename="books_analysis",
+    )
+
+    database_section = dict(
+        url="sqlite:///../data/books.db",
+        echo=False,
+    )
+
+    analysis_section = dict(
+        price_history_days=30,
+        trend_threshold=0.05,
+        generate_charts=True,
+    )
+
+    config = {}
+    config["scraper"] = scraper_section
+    config["sources"] = [books_source]
+    config["output"] = output_section
+    config["database"] = database_section
+    config["analysis"] = analysis_section
+    return config
+
+
+def custom_price_processing(price_text):
+    """
+    Parse a raw price string such as "£51.77" into a float.
+
+    Args:
+        price_text: Raw price text from website
+
+    Returns:
+        Parsed price as float, or None if the input is not a string or
+        does not hold a parsable number once the pound sign and the
+        surrounding whitespace are removed
+    """
+    # Non-string input (e.g. None for a missing element) has no usable
+    # .replace(); report it as a parse failure rather than letting an
+    # AttributeError escape to the caller.
+    if not isinstance(price_text, str):
+        return None
+
+    # Remove currency symbol and whitespace
+    price_clean = price_text.replace('£', '').strip()
+
+    try:
+        return float(price_clean)
+    except ValueError:
+        return None
+
+
+def custom_rating_processing(rating_element):
+    """
+    Translate a star-rating CSS class into a numeric rating.
+
+    Args:
+        rating_element: Object whose .get('class', []) yields the class
+            names to inspect (e.g. a BeautifulSoup element)
+
+    Returns:
+        The rating as a float between 1.0 and 5.0 for the first class name
+        that spells a number word ("One" .. "Five", case-insensitive), or
+        None when no class matches or any error occurs while reading them
+    """
+    stars_by_word = {
+        'one': 1.0,
+        'two': 2.0,
+        'three': 3.0,
+        'four': 4.0,
+        'five': 5.0,
+    }
+
+    try:
+        # e.g. ["star-rating", "Three"] -> 3.0
+        for css_class in rating_element.get('class', []):
+            stars = stars_by_word.get(css_class.lower())
+            if stars is not None:
+                return stars
+    except Exception:
+        return None
+
+    return None
+
+
+def analyze_price_trends(data):
+    """
+    Perform advanced price trend analysis.
+
+    Args:
+        data: List of product data dictionaries; prices are read from the
+            'price' key and, when present, ratings from the 'rating' key
+
+    Returns:
+        Dictionary containing trend analysis results:
+        "total_products", "price_statistics", "price_distribution" and,
+        when at least one numeric rating exists, "rating_statistics".
+        If data is empty or no record carries a 'price' key, a dictionary
+        with a single "error" message is returned instead.
+    """
+    if not data:
+        return {"error": "No data available for analysis"}
+
+    # Convert to DataFrame
+    df = pd.DataFrame(data)
+
+    # Records without any 'price' key would otherwise raise KeyError below;
+    # report it through the same error-dict channel as empty input.
+    if 'price' not in df.columns:
+        return {"error": "No price data available for analysis"}
+
+    # Ensure price is numeric; unparsable values become NaN and are dropped.
+    df['price'] = pd.to_numeric(df['price'], errors='coerce')
+    # .copy() so the rating assignment further down works on an independent
+    # frame rather than on a result pandas may flag as a copy.
+    df = df.dropna(subset=['price']).copy()
+    prices = df['price']
+
+    # Basic statistics. NOTE: pandas yields NaN for "std" with a single row
+    # and for every statistic when no row has a valid price.
+    analysis = {
+        "total_products": len(df),
+        "price_statistics": {
+            "mean": float(prices.mean()),
+            "median": float(prices.median()),
+            "std": float(prices.std()),
+            "min": float(prices.min()),
+            "max": float(prices.max()),
+            "quartiles": {
+                "25%": float(prices.quantile(0.25)),
+                "75%": float(prices.quantile(0.75))
+            }
+        },
+        # Half-open bands: lower bound inclusive, upper bound exclusive.
+        "price_distribution": {
+            "under_10": int((prices < 10).sum()),
+            "10_to_20": int(((prices >= 10) & (prices < 20)).sum()),
+            "20_to_30": int(((prices >= 20) & (prices < 30)).sum()),
+            "30_to_40": int(((prices >= 30) & (prices < 40)).sum()),
+            "over_40": int((prices >= 40).sum())
+        }
+    }
+
+    # Rating analysis if available
+    if 'rating' in df.columns:
+        df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
+        rating_df = df.dropna(subset=['rating'])
+
+        if not rating_df.empty:
+            analysis["rating_statistics"] = {
+                "mean": float(rating_df['rating'].mean()),
+                "distribution": rating_df['rating'].value_counts().to_dict()
+            }
+
+    return analysis
+
+
+def generate_visualizations(data, analysis, output_dir):
+    """
+    Generate visualization charts for the analysis.
+
+    Draws a 2x2 figure (price histogram, price-band bar chart, price box
+    plot, rating-vs-price scatter) and saves it as
+    "market_analysis_charts.png" inside output_dir.
+
+    Args:
+        data: List of product data dictionaries; must provide 'price' and
+            may provide 'rating'
+        analysis: Analysis results dictionary; its "price_distribution"
+            entry feeds the bar chart (an empty chart is drawn if absent)
+        output_dir: Directory to save charts; created if it does not exist
+    """
+    # Create output directory if it doesn't exist
+    Path(output_dir).mkdir(parents=True, exist_ok=True)
+
+    # Convert to DataFrame and keep only rows with a numeric price
+    df = pd.DataFrame(data)
+    df['price'] = pd.to_numeric(df['price'], errors='coerce')
+    df = df.dropna(subset=['price'])
+
+    # Set style
+    sns.set(style="whitegrid")
+    fig = plt.figure(figsize=(12, 8))
+
+    # try/finally so the figure is released even when plotting or saving
+    # fails; otherwise it would stay registered with pyplot.
+    try:
+        # Price distribution histogram
+        plt.subplot(2, 2, 1)
+        sns.histplot(df['price'], bins=20, kde=True)
+        plt.title('Price Distribution')
+        plt.xlabel('Price (£)')
+        plt.ylabel('Count')
+
+        # Price distribution by category. .get() tolerates an analysis
+        # dict without this entry (e.g. one that only carries "error").
+        plt.subplot(2, 2, 2)
+        price_dist = analysis.get('price_distribution', {})
+        plt.bar(list(price_dist.keys()), list(price_dist.values()))
+        plt.title('Price Distribution by Category')
+        plt.xlabel('Price Category')
+        plt.ylabel('Count')
+        plt.xticks(rotation=45)
+
+        # Box plot for prices
+        plt.subplot(2, 2, 3)
+        sns.boxplot(y=df['price'])
+        plt.title('Price Box Plot')
+        plt.ylabel('Price (£)')
+
+        # Rating vs Price scatter plot (if ratings available)
+        plt.subplot(2, 2, 4)
+        rating_df = None
+        if 'rating' in df.columns:
+            numeric_rating = pd.to_numeric(df['rating'], errors='coerce')
+            rating_df = df.assign(rating=numeric_rating).dropna(subset=['rating'])
+
+        if rating_df is not None and not rating_df.empty:
+            sns.scatterplot(x='rating', y='price', data=rating_df)
+            plt.title('Rating vs Price')
+            plt.xlabel('Rating')
+            plt.ylabel('Price (£)')
+        else:
+            # Same placeholder whether the column is missing or has no
+            # numeric values.
+            plt.text(0.5, 0.5, 'No rating data available',
+                     horizontalalignment='center', verticalalignment='center')
+
+        # Adjust layout and save
+        plt.tight_layout()
+        plt.savefig(f"{output_dir}/market_analysis_charts.png", dpi=300)
+    finally:
+        plt.close(fig)
+
+    print(f"✓ Visualization charts saved to {output_dir}/market_analysis_charts.png")
+
+
+def main():
+    """Main function demonstrating advanced scraper usage."""
+    
+    # Setup logging
+    setup_logger(verbose=True)
+    
+    # Create custom configuration
+    config = create_custom_config()
+    print("✓ Custom configuration created")
+    
+    # Initialize scraper
+    try:
+        scraper = MarketTrendsScraper(config, headless=True)
+        print("✓ Scraper initialized successfully")
+    except Exception as e:
+        print(f"✗ Failed to initialize scraper: {str(e)}")
+        return 1
+    
+    try:
+        # Scrape market trends data
+        print("\n🔍 Scraping market trends data...")
+        data = scraper.scrape_market_trends()
+        print(f"✓ Scraped {len(data)} product records")
+        
+        if not data:
+            print("⚠ No data was scraped. Check your configuration and selectors.")
+            return 0
+        
+        # Save raw data
+        output_file = "../data/advanced_example_output.csv"
+        scraper.save_data(data, output_file)
+        print(f"✓ Raw data saved to {output_file}")
+        
+        # Perform advanced analysis
+        print("\n📊 Performing advanced analysis...")
+        analysis = analyze_price_trends(data)
+        print("✓ Advanced analysis completed")
+        
+        # Save analysis results
+        analysis_file = "../data/advanced_example_analysis.json"
+        with open(analysis_file, 'w') as f:
+            json.dump(analysis, f, indent=2)
+        print(f"✓ Analysis saved to {analysis_file}")
+        
+        # Generate visualizations
+        print("\n📈 Generating visualization charts...")
+        charts_dir = "../data/charts"
+        generate_visualizations(data, analysis, charts_dir)
+        
+        # Print detailed summary
+        print("\n📋 Detailed Summary:")
+        print(f"  - Total products: {analysis.get('total_products', 0)}")
+        
+        if 'price_statistics' in analysis:
+            price_stats = analysis['price_statistics']
+            print(f"  - Average price: £{price_stats.get('mean', 0):.2f}")
+            print(f"  - Median price: £{price_stats.get('median', 0):.2f}")
+            print(f"  - Standard deviation: £{price_stats.get('std', 0):.2f}")
+            print(f"  - Price range: £{price_stats.get('min', 0):.2f} - £{price_stats.get('max', 0):.2f}")
+        
+        if 'price_distribution' in analysis:
+            print("  - Price distribution:")
+            for category, count in analysis['price_distribution'].items():
+                print(f"    * {category}: {count} products")
+        
+        if 'rating_statistics' in analysis:
+            rating_stats = analysis['rating_statistics']
+            print(f"  - Average rating: {rating_stats.get('mean', 0):.2f}")
+            print("  - Rating distribution:")
+            for rating, count in rating_stats['distribution'].items():
+                print(f"    * {rating} stars: {count} products")
+        
+        print("\n✅ Advanced market trends analysis completed successfully!")
+        return 0
+