Initial commit: Market Trends Scraper
This commit is contained in:
327
examples/advanced_usage.py
Normal file
327
examples/advanced_usage.py
Normal file
@@ -0,0 +1,327 @@
|
||||
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Advanced Usage Example for Market Trends Scraper
|
||||
|
||||
This script demonstrates advanced features of the Market Trends Scraper,
|
||||
including custom data processing, trend analysis, and visualization.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
# Add src directory to Python path
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "src"))
|
||||
|
||||
from config_manager import ConfigManager
|
||||
from scraper import MarketTrendsScraper
|
||||
from logger import setup_logger
|
||||
|
||||
|
||||
def create_custom_config():
    """
    Build the configuration dictionary used by this example.

    The result has five top-level sections: "scraper", "sources",
    "output", "database" and "analysis". Each call returns a freshly
    built dictionary.
    """
    # Settings for the scraper itself
    scraper_settings = {
        "delay_between_requests": 2.0,
        "timeout": 30,
        "max_retries": 3,
        "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "headless": True,
        "window_size": [1920, 1080],
    }

    # The single source scraped by this example, with its CSS selectors
    books_source = {
        "name": "books_store",
        "url": "https://books.toscrape.com/",
        "type": "ecommerce",
        "enabled": True,
        "use_selenium": False,
        "selectors": {
            "product": "article.product_pod",
            "name": "h3 a",
            "price": "p.price_color",
            "rating": "p.star-rating",
            "availability": "p.instock.availability",
        },
        "pagination": {
            "next_page": "li.next a",
            "max_pages": 3,
        },
    }

    output_settings = {
        "format": "csv",
        "include_timestamp": True,
        "filename": "books_analysis",
    }

    database_settings = {
        "url": "sqlite:///../data/books.db",
        "echo": False,
    }

    analysis_settings = {
        "price_history_days": 30,
        "trend_threshold": 0.05,
        "generate_charts": True,
    }

    return {
        "scraper": scraper_settings,
        "sources": [books_source],
        "output": output_settings,
        "database": database_settings,
        "analysis": analysis_settings,
    }
|
||||
|
||||
|
||||
def custom_price_processing(price_text):
    """
    Custom price processing function for specific formats.

    Strips the pound sign and surrounding whitespace from the text and
    converts what remains to a float.

    Args:
        price_text: Raw price text from website (e.g. "£51.77"). A value
            that is already a number is returned as a float. Any other
            non-string value, including None, is treated as unparseable.

    Returns:
        Parsed price as float or None if parsing fails
    """
    # bool is a subclass of int but is never a meaningful price
    if isinstance(price_text, (int, float)) and not isinstance(price_text, bool):
        return float(price_text)

    if not isinstance(price_text, str):
        # Covers None (e.g. a missing price element): calling .replace() on
        # it would raise AttributeError instead of returning None.
        return None

    # "Â" shows up in front of "£" when UTF-8 bytes are decoded as Latin-1,
    # so drop it together with the currency symbol.
    price_clean = price_text.replace('Â', '').replace('£', '').strip()

    try:
        return float(price_clean)
    except ValueError:
        return None
|
||||
|
||||
|
||||
def custom_rating_processing(rating_element):
    """
    Custom rating processing for star ratings.

    Looks through the element's "class" attribute for a word from "one"
    to "five" (case-insensitive) and maps it to the matching number,
    e.g. "star-rating Three" -> 3.0.

    Args:
        rating_element: BeautifulSoup element containing rating. Any
            object with a dict-style ``get`` works; the "class" value may
            be a list of class names or one whitespace-separated string.

    Returns:
        Parsed rating as float or None if parsing fails
    """
    # Built once per call rather than once per class name
    rating_by_word = {
        'one': 1.0,
        'two': 2.0,
        'three': 3.0,
        'four': 4.0,
        'five': 5.0,
    }

    try:
        class_names = rating_element.get('class', [])

        # A plain string would otherwise be iterated character by
        # character and never match; split it into class names first.
        if isinstance(class_names, str):
            class_names = class_names.split()

        for class_name in class_names:
            if not isinstance(class_name, str):
                # Skip odd entries instead of abandoning the whole lookup
                continue
            rating = rating_by_word.get(class_name.lower())
            if rating is not None:
                return rating
    except (AttributeError, TypeError):
        # rating_element is None / has no get(), or "class" is not iterable
        return None

    return None
|
||||
|
||||
|
||||
def analyze_price_trends(data):
    """
    Perform advanced price trend analysis.

    Prices are coerced to numbers; records whose price is missing or not
    numeric are left out of every figure below.

    Args:
        data: List of product data dictionaries. Each record is expected
            to carry a "price" entry and may carry a "rating" entry.

    Returns:
        Dictionary containing trend analysis results:
            - "total_products": number of records with a numeric price
            - "price_statistics": mean, median, std, min, max and the
              25% / 75% quartiles
            - "price_distribution": record counts per price bucket
            - "rating_statistics": mean and value counts of the ratings;
              present only when at least one priced record has a numeric
              rating
        When there is nothing to analyse (no records, or no record with a
        numeric price) the result is ``{"error": <message>}`` instead.
    """
    if not data:
        return {"error": "No data available for analysis"}

    # Convert to DataFrame
    df = pd.DataFrame(data)

    # Records without any "price" key would otherwise raise KeyError below
    if 'price' not in df.columns:
        return {"error": "No valid price data available for analysis"}

    # Numeric prices only; unparseable values become NaN and are dropped
    prices = pd.to_numeric(df['price'], errors='coerce').dropna()

    # Statistics over zero values would all be NaN, so report it explicitly
    if prices.empty:
        return {"error": "No valid price data available for analysis"}

    analysis = {
        "total_products": len(prices),
        "price_statistics": {
            "mean": float(prices.mean()),
            "median": float(prices.median()),
            # NOTE: pandas' default sample std is NaN for a single value
            "std": float(prices.std()),
            "min": float(prices.min()),
            "max": float(prices.max()),
            "quartiles": {
                "25%": float(prices.quantile(0.25)),
                "75%": float(prices.quantile(0.75)),
            },
        },
        # Buckets are half-open: lower bound included, upper bound excluded
        "price_distribution": {
            "under_10": int((prices < 10).sum()),
            "10_to_20": int(((prices >= 10) & (prices < 20)).sum()),
            "20_to_30": int(((prices >= 20) & (prices < 30)).sum()),
            "30_to_40": int(((prices >= 30) & (prices < 40)).sum()),
            "over_40": int((prices >= 40).sum()),
        },
    }

    # Rating analysis if available, restricted to records with a valid price
    if 'rating' in df.columns:
        ratings = pd.to_numeric(
            df.loc[prices.index, 'rating'], errors='coerce'
        ).dropna()

        if not ratings.empty:
            analysis["rating_statistics"] = {
                "mean": float(ratings.mean()),
                # Plain float/int so the result is JSON-serialisable
                # regardless of the pandas version in use
                "distribution": {
                    float(rating): int(count)
                    for rating, count in ratings.value_counts().items()
                },
            }

    return analysis
|
||||
|
||||
|
||||
def generate_visualizations(data, analysis, output_dir):
    """
    Generate visualization charts for the analysis.

    Draws a 2x2 grid (price histogram, bar chart of the price buckets,
    price box plot, rating-vs-price scatter) and saves it as
    "market_analysis_charts.png" inside ``output_dir``. A panel with
    nothing to plot shows a short placeholder message instead.

    Args:
        data: List of product data dictionaries
        analysis: Analysis results dictionary; its "price_distribution"
            entry (bucket name -> count) feeds the bar chart
        output_dir: Directory to save charts (created if missing)
    """
    # Create output directory if it doesn't exist
    chart_dir = Path(output_dir)
    chart_dir.mkdir(parents=True, exist_ok=True)

    raw = pd.DataFrame(data)

    # Build a clean numeric frame instead of assigning columns onto a
    # filtered frame; a missing column simply stays absent / empty.
    frame = pd.DataFrame(index=raw.index)
    for column in ('price', 'rating'):
        if column in raw.columns:
            frame[column] = pd.to_numeric(raw[column], errors='coerce')
    if 'price' not in frame.columns:
        frame['price'] = float('nan')
    frame = frame.dropna(subset=['price'])

    # The theme must be set before the figure is created
    sns.set_theme(style="whitegrid")
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    hist_ax, bar_ax = axes[0]
    box_ax, scatter_ax = axes[1]

    try:
        # Price distribution histogram
        if frame.empty:
            _show_placeholder(hist_ax, 'No price data available')
        else:
            sns.histplot(frame['price'], bins=20, kde=True, ax=hist_ax)
        hist_ax.set_title('Price Distribution')
        hist_ax.set_xlabel('Price (£)')
        hist_ax.set_ylabel('Count')

        # Price distribution by category
        price_dist = analysis.get('price_distribution') or {}
        if price_dist:
            bar_ax.bar(list(price_dist.keys()), list(price_dist.values()))
        else:
            _show_placeholder(bar_ax, 'No price data available')
        bar_ax.set_title('Price Distribution by Category')
        bar_ax.set_xlabel('Price Category')
        bar_ax.set_ylabel('Count')
        bar_ax.tick_params(axis='x', labelrotation=45)

        # Box plot for prices
        if frame.empty:
            _show_placeholder(box_ax, 'No price data available')
        else:
            sns.boxplot(y=frame['price'], ax=box_ax)
        box_ax.set_title('Price Box Plot')
        box_ax.set_ylabel('Price (£)')

        # Rating vs Price scatter plot (if ratings available)
        if 'rating' in frame.columns:
            rated = frame.dropna(subset=['rating'])
        else:
            rated = frame.iloc[0:0]
        if rated.empty:
            _show_placeholder(scatter_ax, 'No rating data available')
        else:
            sns.scatterplot(x='rating', y='price', data=rated, ax=scatter_ax)
            scatter_ax.set_title('Rating vs Price')
            scatter_ax.set_xlabel('Rating')
            scatter_ax.set_ylabel('Price (£)')

        # Adjust layout and save
        fig.tight_layout()
        fig.savefig(chart_dir / "market_analysis_charts.png", dpi=300)
    finally:
        # Release the figure even when plotting or saving fails
        plt.close(fig)

    print(f"✓ Visualization charts saved to {output_dir}/market_analysis_charts.png")


def _show_placeholder(ax, message):
    """Write ``message`` centred in ``ax`` when a panel has nothing to plot."""
    ax.text(0.5, 0.5, message,
            horizontalalignment='center', verticalalignment='center',
            transform=ax.transAxes)
|
||||
|
||||
|
||||
def main():
|
||||
"""Main function demonstrating advanced scraper usage."""
|
||||
|
||||
# Setup logging
|
||||
setup_logger(verbose=True)
|
||||
|
||||
# Create custom configuration
|
||||
config = create_custom_config()
|
||||
print("✓ Custom configuration created")
|
||||
|
||||
# Initialize scraper
|
||||
try:
|
||||
scraper = MarketTrendsScraper(config, headless=True)
|
||||
print("✓ Scraper initialized successfully")
|
||||
except Exception as e:
|
||||
print(f"✗ Failed to initialize scraper: {str(e)}")
|
||||
return 1
|
||||
|
||||
try:
|
||||
# Scrape market trends data
|
||||
print("\n🔍 Scraping market trends data...")
|
||||
data = scraper.scrape_market_trends()
|
||||
print(f"✓ Scraped {len(data)} product records")
|
||||
|
||||
if not data:
|
||||
print("⚠ No data was scraped. Check your configuration and selectors.")
|
||||
return 0
|
||||
|
||||
# Save raw data
|
||||
output_file = "../data/advanced_example_output.csv"
|
||||
scraper.save_data(data, output_file)
|
||||
print(f"✓ Raw data saved to {output_file}")
|
||||
|
||||
# Perform advanced analysis
|
||||
print("\n📊 Performing advanced analysis...")
|
||||
analysis = analyze_price_trends(data)
|
||||
print("✓ Advanced analysis completed")
|
||||
|
||||
# Save analysis results
|
||||
analysis_file = "../data/advanced_example_analysis.json"
|
||||
with open(analysis_file, 'w') as f:
|
||||
json.dump(analysis, f, indent=2)
|
||||
print(f"✓ Analysis saved to {analysis_file}")
|
||||
|
||||
# Generate visualizations
|
||||
print("\n📈 Generating visualization charts...")
|
||||
charts_dir = "../data/charts"
|
||||
generate_visualizations(data, analysis, charts_dir)
|
||||
|
||||
# Print detailed summary
|
||||
print("\n📋 Detailed Summary:")
|
||||
print(f" - Total products: {analysis.get('total_products', 0)}")
|
||||
|
||||
if 'price_statistics' in analysis:
|
||||
price_stats = analysis['price_statistics']
|
||||
print(f" - Average price: £{price_stats.get('mean', 0):.2f}")
|
||||
print(f" - Median price: £{price_stats.get('median', 0):.2f}")
|
||||
print(f" - Standard deviation: £{price_stats.get('std', 0):.2f}")
|
||||
print(f" - Price range: £{price_stats.get('min', 0):.2f} - £{price_stats.get('max', 0):.2f}")
|
||||
|
||||
if 'price_distribution' in analysis:
|
||||
print(" - Price distribution:")
|
||||
for category, count in analysis['price_distribution'].items():
|
||||
print(f" * {category}: {count} products")
|
||||
|
||||
if 'rating_statistics' in analysis:
|
||||
rating_stats = analysis['rating_statistics']
|
||||
print(f" - Average rating: {rating_stats.get('mean', 0):.2f}")
|
||||
print(" - Rating distribution:")
|
||||
for rating, count in rating_stats['distribution'].items():
|
||||
print(f" * {rating} stars: {count} products")
|
||||
|
||||
print("\n✅ Advanced market trends analysis completed successfully!")
|
||||
return 0
|
||||
|
Reference in New Issue
Block a user