328 lines
10 KiB
Python
328 lines
10 KiB
Python
|
|
#!/usr/bin/env python3
|
|
"""
|
|
Advanced Usage Example for Market Trends Scraper
|
|
|
|
This script demonstrates advanced features of the Market Trends Scraper,
|
|
including custom data processing, trend analysis, and visualization.
|
|
"""
|
|
|
|
import sys
|
|
import json
|
|
import pandas as pd
|
|
import matplotlib.pyplot as plt
|
|
import seaborn as sns
|
|
from datetime import datetime, timedelta
|
|
from pathlib import Path
|
|
|
|
# Add src directory to Python path
|
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "src"))
|
|
|
|
from config_manager import ConfigManager
|
|
from scraper import MarketTrendsScraper
|
|
from logger import setup_logger
|
|
|
|
|
|
def create_custom_config():
    """Build the configuration dictionary used by this example.

    Returns:
        A freshly built dict with five top-level sections, in this order:
        "scraper", "sources", "output", "database" and "analysis".
    """
    # Request/browser settings.
    scraper_settings = {
        "delay_between_requests": 2.0,
        "timeout": 30,
        "max_retries": 3,
        "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "headless": True,
        "window_size": [1920, 1080],
    }

    # CSS selectors for the single source scraped by the example.
    book_selectors = {
        "product": "article.product_pod",
        "name": "h3 a",
        "price": "p.price_color",
        "rating": "p.star-rating",
        "availability": "p.instock.availability",
    }
    book_pagination = {
        "next_page": "li.next a",
        "max_pages": 3,
    }
    books_source = {
        "name": "books_store",
        "url": "https://books.toscrape.com/",
        "type": "ecommerce",
        "enabled": True,
        "use_selenium": False,
        "selectors": book_selectors,
        "pagination": book_pagination,
    }

    output_settings = {
        "format": "csv",
        "include_timestamp": True,
        "filename": "books_analysis",
    }
    database_settings = {
        "url": "sqlite:///../data/books.db",
        "echo": False,
    }
    analysis_settings = {
        "price_history_days": 30,
        "trend_threshold": 0.05,
        "generate_charts": True,
    }

    config = {}
    config["scraper"] = scraper_settings
    config["sources"] = [books_source]
    config["output"] = output_settings
    config["database"] = database_settings
    config["analysis"] = analysis_settings
    return config
|
|
|
|
|
|
def custom_price_processing(price_text):
    """
    Custom price processing function for specific formats.

    Strips the pound sign and surrounding whitespace, then converts the
    remainder to a float. A stray "Â" is removed as well: that is what a
    UTF-8 encoded "£" turns into when the page is decoded as Latin-1.
    Plain int/float values are accepted and simply converted.

    Args:
        price_text: Raw price text from website (e.g. "£51.77")

    Returns:
        Parsed price as float or None if parsing fails (including None,
        bool, or any other non-text, non-numeric input)
    """
    # bool is a subclass of int, but True/False are never meaningful prices.
    if price_text is None or isinstance(price_text, bool):
        return None

    if isinstance(price_text, (int, float)):
        return float(price_text)

    # Anything else that is not text (bytes, elements, ...) cannot be parsed;
    # report failure instead of raising AttributeError/TypeError.
    if not isinstance(price_text, str):
        return None

    # Remove currency symbol (and its mis-decoded prefix) and whitespace
    price_clean = price_text.replace('Â', '').replace('£', '').strip()

    # Convert to float
    try:
        return float(price_clean)
    except ValueError:
        return None
|
|
|
|
|
|
def custom_rating_processing(rating_element):
    """
    Custom rating processing for star ratings.

    The rating is encoded as a word in the element's CSS classes
    (e.g. "star-rating Three" -> 3.0). Matching is case-insensitive and the
    first class that names a rating wins.

    Args:
        rating_element: BeautifulSoup element containing rating. Any object
            with a dict-style ``get('class', default)`` works. The class
            value may be a list of names or one whitespace-separated string.

    Returns:
        Parsed rating as float or None if parsing fails
    """
    # Built once per call rather than once per class name.
    rating_map = {
        'one': 1.0,
        'two': 2.0,
        'three': 3.0,
        'four': 4.0,
        'five': 5.0,
    }

    try:
        class_names = rating_element.get('class', []) or []

        # A single string would otherwise be iterated character by character
        # and never match any rating word.
        if isinstance(class_names, str):
            class_names = class_names.split()

        for class_name in class_names:
            rating = rating_map.get(class_name.lower())
            if rating is not None:
                return rating
    except (AttributeError, TypeError):
        # Missing element (None), an object without .get, a non-iterable
        # class value, or non-string class entries: treat as "no rating".
        return None

    return None
|
|
|
|
|
|
def analyze_price_trends(data):
    """
    Perform advanced price trend analysis.

    Prices are coerced to numbers and rows without a usable price are
    ignored. Every number in the result is a native Python int/float and
    never NaN, so the result can be written as standard JSON.

    Args:
        data: List of product data dictionaries. Each is expected to carry a
            'price' entry and may carry a 'rating' entry.

    Returns:
        Dictionary containing trend analysis results:
          - {"error": ...} only, when data is empty or has no 'price' field
          - "error", "total_products" (0) and a zeroed "price_distribution"
            when no row has a numeric price
          - otherwise "total_products", "price_statistics",
            "price_distribution" and, if numeric ratings exist,
            "rating_statistics"
    """
    if not data:
        return {"error": "No data available for analysis"}

    # Convert to DataFrame
    df = pd.DataFrame(data)

    # Without a price column there is nothing to analyse; report it the same
    # way as empty input instead of raising KeyError.
    if 'price' not in df.columns:
        return {"error": "No price data available for analysis"}

    # Ensure price is numeric and keep only rows with a usable price
    all_prices = pd.to_numeric(df['price'], errors='coerce')
    has_price = all_prices.notna()
    prices = all_prices[has_price]

    price_distribution = {
        "under_10": int((prices < 10).sum()),
        "10_to_20": int(((prices >= 10) & (prices < 20)).sum()),
        "20_to_30": int(((prices >= 20) & (prices < 30)).sum()),
        "30_to_40": int(((prices >= 30) & (prices < 40)).sum()),
        "over_40": int((prices >= 40).sum()),
    }

    if prices.empty:
        # mean/median/... of an empty series are NaN, which is not valid
        # JSON. Omit the statistics and flag the problem instead.
        return {
            "error": "No valid price data available for analysis",
            "total_products": 0,
            "price_distribution": price_distribution,
        }

    # The sample standard deviation is undefined (NaN) for one observation;
    # report 0.0 spread in that case.
    price_std = float(prices.std()) if len(prices) > 1 else 0.0

    # Basic statistics
    analysis = {
        "total_products": int(len(prices)),
        "price_statistics": {
            "mean": float(prices.mean()),
            "median": float(prices.median()),
            "std": price_std,
            "min": float(prices.min()),
            "max": float(prices.max()),
            "quartiles": {
                "25%": float(prices.quantile(0.25)),
                "75%": float(prices.quantile(0.75)),
            },
        },
        "price_distribution": price_distribution,
    }

    # Rating analysis if available (restricted to rows that have a price)
    if 'rating' in df.columns:
        ratings = pd.to_numeric(df.loc[has_price, 'rating'], errors='coerce').dropna()

        if not ratings.empty:
            rating_counts = ratings.value_counts()
            analysis["rating_statistics"] = {
                "mean": float(ratings.mean()),
                # Force native types so the mapping is JSON-serialisable
                # regardless of the pandas version.
                "distribution": {
                    float(rating): int(count)
                    for rating, count in rating_counts.items()
                },
            }

    return analysis
|
|
|
|
|
|
def generate_visualizations(data, analysis, output_dir):
    """
    Generate visualization charts for the analysis.

    Draws one figure with a 2x2 grid (price histogram, bar chart of the
    price buckets, price box plot, rating-vs-price scatter) and saves it as
    "market_analysis_charts.png" inside output_dir.

    Args:
        data: List of product data dictionaries; a 'price' field is required
            and a 'rating' field is optional
        analysis: Analysis results dictionary; its 'price_distribution'
            entry supplies the bar chart
        output_dir: Directory to save charts (created if it does not exist)

    Raises:
        KeyError: If data has no 'price' column or analysis has no
            'price_distribution' entry.
    """
    # Create output directory if it doesn't exist
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    chart_path = f"{output_dir}/market_analysis_charts.png"

    # Convert to DataFrame
    df = pd.DataFrame(data)

    # Ensure price is numeric
    df['price'] = pd.to_numeric(df['price'], errors='coerce')
    df = df.dropna(subset=['price'])

    # Rows usable for the scatter plot: priced products with a numeric rating.
    # Stays None when the data has no rating field at all.
    rating_df = None
    if 'rating' in df.columns:
        numeric_ratings = pd.to_numeric(df['rating'], errors='coerce')
        rating_df = df.assign(rating=numeric_ratings).dropna(subset=['rating'])

    # Set style
    sns.set(style="whitegrid")
    fig = plt.figure(figsize=(12, 8))

    try:
        # Price distribution histogram
        plt.subplot(2, 2, 1)
        sns.histplot(df['price'], bins=20, kde=True)
        plt.title('Price Distribution')
        plt.xlabel('Price (£)')
        plt.ylabel('Count')

        # Price distribution by category
        plt.subplot(2, 2, 2)
        price_dist = analysis['price_distribution']
        categories = list(price_dist.keys())
        values = list(price_dist.values())
        plt.bar(categories, values)
        plt.title('Price Distribution by Category')
        plt.xlabel('Price Category')
        plt.ylabel('Count')
        plt.xticks(rotation=45)

        # Box plot for prices
        plt.subplot(2, 2, 3)
        sns.boxplot(y=df['price'])
        plt.title('Price Box Plot')
        plt.ylabel('Price (£)')

        # Rating vs Price scatter plot (if ratings available)
        plt.subplot(2, 2, 4)
        if rating_df is not None and not rating_df.empty:
            sns.scatterplot(x='rating', y='price', data=rating_df)
            plt.title('Rating vs Price')
            plt.xlabel('Rating')
            plt.ylabel('Price (£)')
        else:
            plt.text(0.5, 0.5, 'No rating data available',
                     horizontalalignment='center', verticalalignment='center')

        # Adjust layout and save
        plt.tight_layout()
        plt.savefig(chart_path, dpi=300)
    finally:
        # Close the figure even if plotting or saving failed; pyplot keeps
        # every open figure alive until it is closed explicitly.
        plt.close(fig)

    print(f"✓ Visualization charts saved to {chart_path}")
|
|
|
|
|
|
def main():
|
|
"""Main function demonstrating advanced scraper usage."""
|
|
|
|
# Setup logging
|
|
setup_logger(verbose=True)
|
|
|
|
# Create custom configuration
|
|
config = create_custom_config()
|
|
print("✓ Custom configuration created")
|
|
|
|
# Initialize scraper
|
|
try:
|
|
scraper = MarketTrendsScraper(config, headless=True)
|
|
print("✓ Scraper initialized successfully")
|
|
except Exception as e:
|
|
print(f"✗ Failed to initialize scraper: {str(e)}")
|
|
return 1
|
|
|
|
try:
|
|
# Scrape market trends data
|
|
print("\n🔍 Scraping market trends data...")
|
|
data = scraper.scrape_market_trends()
|
|
print(f"✓ Scraped {len(data)} product records")
|
|
|
|
if not data:
|
|
print("⚠ No data was scraped. Check your configuration and selectors.")
|
|
return 0
|
|
|
|
# Save raw data
|
|
output_file = "../data/advanced_example_output.csv"
|
|
scraper.save_data(data, output_file)
|
|
print(f"✓ Raw data saved to {output_file}")
|
|
|
|
# Perform advanced analysis
|
|
print("\n📊 Performing advanced analysis...")
|
|
analysis = analyze_price_trends(data)
|
|
print("✓ Advanced analysis completed")
|
|
|
|
# Save analysis results
|
|
analysis_file = "../data/advanced_example_analysis.json"
|
|
with open(analysis_file, 'w') as f:
|
|
json.dump(analysis, f, indent=2)
|
|
print(f"✓ Analysis saved to {analysis_file}")
|
|
|
|
# Generate visualizations
|
|
print("\n📈 Generating visualization charts...")
|
|
charts_dir = "../data/charts"
|
|
generate_visualizations(data, analysis, charts_dir)
|
|
|
|
# Print detailed summary
|
|
print("\n📋 Detailed Summary:")
|
|
print(f" - Total products: {analysis.get('total_products', 0)}")
|
|
|
|
if 'price_statistics' in analysis:
|
|
price_stats = analysis['price_statistics']
|
|
print(f" - Average price: £{price_stats.get('mean', 0):.2f}")
|
|
print(f" - Median price: £{price_stats.get('median', 0):.2f}")
|
|
print(f" - Standard deviation: £{price_stats.get('std', 0):.2f}")
|
|
print(f" - Price range: £{price_stats.get('min', 0):.2f} - £{price_stats.get('max', 0):.2f}")
|
|
|
|
if 'price_distribution' in analysis:
|
|
print(" - Price distribution:")
|
|
for category, count in analysis['price_distribution'].items():
|
|
print(f" * {category}: {count} products")
|
|
|
|
if 'rating_statistics' in analysis:
|
|
rating_stats = analysis['rating_statistics']
|
|
print(f" - Average rating: {rating_stats.get('mean', 0):.2f}")
|
|
print(" - Rating distribution:")
|
|
for rating, count in rating_stats['distribution'].items():
|
|
print(f" * {rating} stars: {count} products")
|
|
|
|
print("\n✅ Advanced market trends analysis completed successfully!")
|
|
return 0
|
|
|