#!/usr/bin/env python3
"""
Advanced Usage Example for Market Trends Scraper

This script demonstrates advanced features of the Market Trends Scraper,
including custom data processing, trend analysis, and visualization.
"""

import sys
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from pathlib import Path

# Add src directory to Python path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "src"))

from config_manager import ConfigManager
from scraper import MarketTrendsScraper
from logger import setup_logger

# Star-rating class words mapped to their numeric value.
_RATING_BY_WORD = {
    "one": 1.0,
    "two": 2.0,
    "three": 3.0,
    "four": 4.0,
    "five": 5.0,
}

# File name of the combined chart image written by generate_visualizations().
_CHART_FILENAME = "market_analysis_charts.png"


def create_custom_config():
    """
    Create a custom configuration for demonstration.

    Returns:
        Dictionary with "scraper", "sources", "output", "database" and
        "analysis" sections. A new dictionary is built on every call.
    """
    return {
        "scraper": {
            "delay_between_requests": 2.0,
            "timeout": 30,
            "max_retries": 3,
            "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            "headless": True,
            "window_size": [1920, 1080],
        },
        "sources": [
            {
                "name": "books_store",
                "url": "https://books.toscrape.com/",
                "type": "ecommerce",
                "enabled": True,
                "use_selenium": False,
                "selectors": {
                    "product": "article.product_pod",
                    "name": "h3 a",
                    "price": "p.price_color",
                    "rating": "p.star-rating",
                    "availability": "p.instock.availability",
                },
                "pagination": {
                    "next_page": "li.next a",
                    "max_pages": 3,
                },
            }
        ],
        "output": {
            "format": "csv",
            "include_timestamp": True,
            "filename": "books_analysis",
        },
        "database": {
            "url": "sqlite:///../data/books.db",
            "echo": False,
        },
        "analysis": {
            "price_history_days": 30,
            "trend_threshold": 0.05,
            "generate_charts": True,
        },
    }


def custom_price_processing(price_text):
    """
    Custom price processing function for specific formats.

    Args:
        price_text: Raw price text from website

    Returns:
        Parsed price as float, or None if parsing fails (including when
        price_text is None or not a string)
    """
    try:
        # Remove the currency symbol and whitespace. "Â" is what is left in
        # front of "£" when UTF-8 text is decoded as Latin-1.
        price_clean = price_text.replace("£", "").replace("Â", "").strip()
        return float(price_clean)
    except (AttributeError, TypeError, ValueError):
        # AttributeError covers None and other non-string inputs.
        return None


def custom_rating_processing(rating_element):
    """
    Custom rating processing for star ratings.

    Args:
        rating_element: BeautifulSoup element containing rating

    Returns:
        Parsed rating as float, or None if no rating class is found or the
        element cannot be inspected
    """
    try:
        # Extract rating from class name (e.g., "star-rating Three" -> 3)
        class_names = rating_element.get("class", [])
        if isinstance(class_names, str):
            # A plain string would otherwise be iterated character by
            # character; split it into individual class names instead.
            class_names = class_names.split()
        for class_name in class_names:
            rating = _RATING_BY_WORD.get(class_name.lower())
            if rating is not None:
                return rating
    except (AttributeError, TypeError):
        # Missing element, no .get(), non-iterable or non-string classes.
        return None
    return None


def analyze_price_trends(data):
    """
    Perform advanced price trend analysis.

    Args:
        data: List of product data dictionaries

    Returns:
        Dictionary containing trend analysis results with the keys
        "total_products", "price_statistics" and "price_distribution", plus
        "rating_statistics" when at least one numeric rating is present.
        If data is empty, or no record has a numeric price, a dictionary
        with a single "error" key is returned instead. The "std" value is
        the pandas sample standard deviation, which is NaN when only one
        price is available.
    """
    if not data:
        return {"error": "No data available for analysis"}

    # Convert to DataFrame
    df = pd.DataFrame(data)
    if "price" not in df.columns:
        return {"error": "No price data available for analysis"}

    # Ensure price is numeric and keep only rows with a usable price
    prices = pd.to_numeric(df["price"], errors="coerce")
    df = df[prices.notna()]
    prices = prices.dropna()
    if prices.empty:
        # Without this guard every statistic below would be NaN.
        return {"error": "No price data available for analysis"}

    # Basic statistics
    analysis = {
        "total_products": len(prices),
        "price_statistics": {
            "mean": float(prices.mean()),
            "median": float(prices.median()),
            "std": float(prices.std()),
            "min": float(prices.min()),
            "max": float(prices.max()),
            "quartiles": {
                "25%": float(prices.quantile(0.25)),
                "75%": float(prices.quantile(0.75)),
            },
        },
        "price_distribution": {
            "under_10": int((prices < 10).sum()),
            "10_to_20": int(((prices >= 10) & (prices < 20)).sum()),
            "20_to_30": int(((prices >= 20) & (prices < 30)).sum()),
            "30_to_40": int(((prices >= 30) & (prices < 40)).sum()),
            "over_40": int((prices >= 40).sum()),
        },
    }

    # Rating analysis if available
    if "rating" in df.columns:
        ratings = pd.to_numeric(df["rating"], errors="coerce").dropna()
        if not ratings.empty:
            analysis["rating_statistics"] = {
                "mean": float(ratings.mean()),
                # Counts are converted to plain ints so the result can be
                # passed to json.dump().
                "distribution": {
                    rating: int(count)
                    for rating, count in ratings.value_counts().items()
                },
            }

    return analysis


def generate_visualizations(data, analysis, output_dir):
    """
    Generate visualization charts for the analysis.

    Draws four panels (price histogram, price-category bar chart, price box
    plot, rating-vs-price scatter plot) into one figure and saves it as
    "market_analysis_charts.png" inside output_dir. If no record has a
    numeric price, a warning is printed and no file is written.

    Args:
        data: List of product data dictionaries
        analysis: Analysis results dictionary
        output_dir: Directory to save charts
    """
    # Convert to DataFrame
    df = pd.DataFrame(data)

    # Ensure price is numeric
    if "price" in df.columns:
        df["price"] = pd.to_numeric(df["price"], errors="coerce")
        df = df.dropna(subset=["price"])
    if "price" not in df.columns or df.empty:
        print("⚠ No price data available; skipping chart generation.")
        return

    # Create output directory if it doesn't exist
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    chart_path = output_path / _CHART_FILENAME

    # Set style
    sns.set(style="whitegrid")
    fig = plt.figure(figsize=(12, 8))
    try:
        # Price distribution histogram
        plt.subplot(2, 2, 1)
        sns.histplot(df["price"], bins=20, kde=True)
        plt.title("Price Distribution")
        plt.xlabel("Price (£)")
        plt.ylabel("Count")

        # Price distribution by category. An analysis dict without this key
        # (e.g. the error result) yields an empty bar chart.
        plt.subplot(2, 2, 2)
        price_dist = (analysis or {}).get("price_distribution", {})
        plt.bar(list(price_dist.keys()), list(price_dist.values()))
        plt.title("Price Distribution by Category")
        plt.xlabel("Price Category")
        plt.ylabel("Count")
        plt.xticks(rotation=45)

        # Box plot for prices
        plt.subplot(2, 2, 3)
        sns.boxplot(y=df["price"])
        plt.title("Price Box Plot")
        plt.ylabel("Price (£)")

        # Rating vs Price scatter plot (if ratings available)
        plt.subplot(2, 2, 4)
        rating_df = None
        if "rating" in df.columns:
            rating_df = df.assign(
                rating=pd.to_numeric(df["rating"], errors="coerce")
            ).dropna(subset=["rating"])
        if rating_df is not None and not rating_df.empty:
            sns.scatterplot(x="rating", y="price", data=rating_df)
            plt.title("Rating vs Price")
            plt.xlabel("Rating")
            plt.ylabel("Price (£)")
        else:
            plt.text(0.5, 0.5, "No rating data available",
                     horizontalalignment="center",
                     verticalalignment="center")

        # Adjust layout and save
        plt.tight_layout()
        plt.savefig(chart_path, dpi=300)
    finally:
        # Close the figure even when plotting or saving fails.
        plt.close(fig)

    print(f"✓ Visualization charts saved to {chart_path}")


def main():
    """Main function demonstrating advanced scraper usage."""
    # Setup logging
setup_logger(verbose=True) # Create custom configuration config = create_custom_config() print("✓ Custom configuration created") # Initialize scraper try: scraper = MarketTrendsScraper(config, headless=True) print("✓ Scraper initialized successfully") except Exception as e: print(f"✗ Failed to initialize scraper: {str(e)}") return 1 try: # Scrape market trends data print("\n🔍 Scraping market trends data...") data = scraper.scrape_market_trends() print(f"✓ Scraped {len(data)} product records") if not data: print("⚠ No data was scraped. Check your configuration and selectors.") return 0 # Save raw data output_file = "../data/advanced_example_output.csv" scraper.save_data(data, output_file) print(f"✓ Raw data saved to {output_file}") # Perform advanced analysis print("\n📊 Performing advanced analysis...") analysis = analyze_price_trends(data) print("✓ Advanced analysis completed") # Save analysis results analysis_file = "../data/advanced_example_analysis.json" with open(analysis_file, 'w') as f: json.dump(analysis, f, indent=2) print(f"✓ Analysis saved to {analysis_file}") # Generate visualizations print("\n📈 Generating visualization charts...") charts_dir = "../data/charts" generate_visualizations(data, analysis, charts_dir) # Print detailed summary print("\n📋 Detailed Summary:") print(f" - Total products: {analysis.get('total_products', 0)}") if 'price_statistics' in analysis: price_stats = analysis['price_statistics'] print(f" - Average price: £{price_stats.get('mean', 0):.2f}") print(f" - Median price: £{price_stats.get('median', 0):.2f}") print(f" - Standard deviation: £{price_stats.get('std', 0):.2f}") print(f" - Price range: £{price_stats.get('min', 0):.2f} - £{price_stats.get('max', 0):.2f}") if 'price_distribution' in analysis: print(" - Price distribution:") for category, count in analysis['price_distribution'].items(): print(f" * {category}: {count} products") if 'rating_statistics' in analysis: rating_stats = analysis['rating_statistics'] print(f" - 
Average rating: {rating_stats.get('mean', 0):.2f}") print(" - Rating distribution:") for rating, count in rating_stats['distribution'].items(): print(f" * {rating} stars: {count} products") print("\n✅ Advanced market trends analysis completed successfully!") return 0