Initial commit: Market Trends Scraper

This commit is contained in:
Dev
2025-09-11 17:46:14 +03:00
commit 4ddcde68d4
17 changed files with 3049 additions and 0 deletions

327
examples/advanced_usage.py Normal file
View File

@@ -0,0 +1,327 @@
#!/usr/bin/env python3
"""
Advanced Usage Example for Market Trends Scraper
This script demonstrates advanced features of the Market Trends Scraper,
including custom data processing, trend analysis, and visualization.
"""
import sys
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from pathlib import Path
# Add src directory to Python path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "src"))
from config_manager import ConfigManager
from scraper import MarketTrendsScraper
from logger import setup_logger
def create_custom_config():
    """
    Build the configuration dictionary used by this example.

    The configuration is assembled section by section and a new dictionary
    is created on every call, so callers may modify the result freely.

    Returns:
        Dictionary with the "scraper", "sources", "output", "database" and
        "analysis" sections, in that order
    """
    # Request/browser settings.
    scraper_section = {
        "delay_between_requests": 2.0,
        "timeout": 30,
        "max_retries": 3,
        "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "headless": True,
        "window_size": [1920, 1080],
    }

    # CSS selectors for the single source defined below.
    book_selectors = {
        "product": "article.product_pod",
        "name": "h3 a",
        "price": "p.price_color",
        "rating": "p.star-rating",
        "availability": "p.instock.availability",
    }
    book_pagination = {
        "next_page": "li.next a",
        "max_pages": 3,
    }
    books_source = {
        "name": "books_store",
        "url": "https://books.toscrape.com/",
        "type": "ecommerce",
        "enabled": True,
        "use_selenium": False,
        "selectors": book_selectors,
        "pagination": book_pagination,
    }

    output_section = {
        "format": "csv",
        "include_timestamp": True,
        "filename": "books_analysis",
    }
    database_section = {
        "url": "sqlite:///../data/books.db",
        "echo": False,
    }
    analysis_section = {
        "price_history_days": 30,
        "trend_threshold": 0.05,
        "generate_charts": True,
    }

    return {
        "scraper": scraper_section,
        "sources": [books_source],
        "output": output_section,
        "database": database_section,
        "analysis": analysis_section,
    }
def custom_price_processing(price_text):
    """
    Custom price processing function for pound-denominated price text.

    Args:
        price_text: Raw price text from website (e.g. "£51.77")

    Returns:
        Parsed price as float, or None if price_text is not a string or
        cannot be parsed as a number
    """
    # Non-string input (e.g. None when the price element was not found) has
    # no .replace(); report it as a parse failure instead of raising.
    if not isinstance(price_text, str):
        return None

    # Remove the currency symbol and surrounding whitespace. 'Â' is also
    # removed because '£' shows up as 'Â£' when UTF-8 bytes are decoded as
    # Latin-1.
    price_clean = price_text.replace('Â', '').replace('£', '').strip()

    try:
        return float(price_clean)
    except ValueError:
        return None
def custom_rating_processing(rating_element):
    """
    Custom rating processing for star ratings.

    The rating is read from the element's ``class`` attribute, where it is
    spelled out as a word (e.g. "star-rating Three" -> 3.0). Matching is
    case-insensitive and the first recognised word wins.

    Args:
        rating_element: BeautifulSoup element containing rating; any object
            with a dict-style ``get`` method is accepted

    Returns:
        Parsed rating as float or None if parsing fails
    """
    rating_map = {
        'one': 1.0,
        'two': 2.0,
        'three': 3.0,
        'four': 4.0,
        'five': 5.0,
    }
    try:
        class_names = rating_element.get('class', [])
        # The attribute may arrive as one space-separated string rather than
        # a list of class names; iterating the string directly would walk it
        # character by character and never match.
        if isinstance(class_names, str):
            class_names = class_names.split()
        for class_name in class_names:
            rating = rating_map.get(class_name.lower())
            if rating is not None:
                return rating
    except (AttributeError, TypeError):
        # AttributeError: no .get() on the element (e.g. None) or a
        # non-string class entry; TypeError: class value is not iterable.
        return None
    return None
def analyze_price_trends(data):
    """
    Perform advanced price trend analysis.

    Prices that cannot be converted to numbers are discarded before any
    statistic is computed. Rating statistics are added only when a 'rating'
    field is present and at least one rating is numeric.

    Args:
        data: List of product data dictionaries

    Returns:
        Dictionary containing trend analysis results with the keys
        "total_products", "price_statistics", "price_distribution" and,
        when ratings are available, "rating_statistics". If data is empty
        or no record carries a 'price' field, a dictionary with a single
        "error" key is returned instead.
    """
    if not data:
        return {"error": "No data available for analysis"}

    df = pd.DataFrame(data)

    # Without this guard a data set in which no record has a 'price' key
    # fails with KeyError instead of reporting a usable error.
    if 'price' not in df.columns:
        return {"error": "No price data available for analysis"}

    # Coerce to numbers; unparsable entries become NaN and are dropped.
    prices = pd.to_numeric(df['price'], errors='coerce')
    df = df.loc[prices.notna()]
    prices = prices.dropna()

    # NOTE(review): with fewer than two valid prices "std" is NaN (pandas
    # uses the sample standard deviation), and with none every statistic is
    # NaN; json.dump writes those as the non-standard token NaN.
    analysis = {
        "total_products": int(prices.size),
        "price_statistics": {
            "mean": float(prices.mean()),
            "median": float(prices.median()),
            "std": float(prices.std()),
            "min": float(prices.min()),
            "max": float(prices.max()),
            "quartiles": {
                "25%": float(prices.quantile(0.25)),
                "75%": float(prices.quantile(0.75)),
            },
        },
        # Buckets are half-open: lower bound inclusive, upper bound exclusive.
        "price_distribution": {
            "under_10": int((prices < 10).sum()),
            "10_to_20": int(((prices >= 10) & (prices < 20)).sum()),
            "20_to_30": int(((prices >= 20) & (prices < 30)).sum()),
            "30_to_40": int(((prices >= 30) & (prices < 40)).sum()),
            "over_40": int((prices >= 40).sum()),
        },
    }

    # Rating analysis if available (only rows that had a valid price).
    if 'rating' in df.columns:
        ratings = pd.to_numeric(df['rating'], errors='coerce').dropna()
        if not ratings.empty:
            analysis["rating_statistics"] = {
                "mean": float(ratings.mean()),
                "distribution": ratings.value_counts().to_dict(),
            }

    return analysis
def generate_visualizations(data, analysis, output_dir):
    """
    Generate visualization charts for the analysis.

    Draws a 2x2 grid (price histogram, price-bucket bar chart, price box
    plot, rating-vs-price scatter) and saves it as
    "market_analysis_charts.png" inside output_dir.

    Args:
        data: List of product data dictionaries
        analysis: Analysis results dictionary; its "price_distribution"
            entry feeds the bar chart (an empty chart is drawn if missing)
        output_dir: Directory to save charts (created if necessary)

    Raises:
        KeyError: If no record in data has a 'price' field
    """
    # Create output directory if it doesn't exist
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    chart_file = f"{output_dir}/market_analysis_charts.png"

    # Keep only rows whose price converts to a number
    df = pd.DataFrame(data)
    df['price'] = pd.to_numeric(df['price'], errors='coerce')
    df = df.dropna(subset=['price'])

    sns.set_theme(style="whitegrid")
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    try:
        hist_ax, bucket_ax = axes[0]
        box_ax, rating_ax = axes[1]

        # Price distribution histogram
        sns.histplot(df['price'], bins=20, kde=True, ax=hist_ax)
        hist_ax.set_title('Price Distribution')
        hist_ax.set_xlabel('Price (£)')
        hist_ax.set_ylabel('Count')

        # Price distribution by category
        price_dist = analysis.get('price_distribution', {})
        bucket_ax.bar(list(price_dist.keys()), list(price_dist.values()))
        bucket_ax.set_title('Price Distribution by Category')
        bucket_ax.set_xlabel('Price Category')
        bucket_ax.set_ylabel('Count')
        bucket_ax.tick_params(axis='x', rotation=45)

        # Box plot for prices
        sns.boxplot(y=df['price'], ax=box_ax)
        box_ax.set_title('Price Box Plot')
        box_ax.set_ylabel('Price (£)')

        # Rating vs Price scatter plot (if ratings available). The numeric
        # ratings go into a new frame rather than being assigned onto the
        # filtered one.
        rating_df = pd.DataFrame()
        if 'rating' in df.columns:
            numeric_rating = pd.to_numeric(df['rating'], errors='coerce')
            rating_df = df.assign(rating=numeric_rating).dropna(subset=['rating'])

        if rating_df.empty:
            rating_ax.text(0.5, 0.5, 'No rating data available',
                           horizontalalignment='center',
                           verticalalignment='center')
        else:
            sns.scatterplot(x='rating', y='price', data=rating_df, ax=rating_ax)
            rating_ax.set_title('Rating vs Price')
            rating_ax.set_xlabel('Rating')
            rating_ax.set_ylabel('Price (£)')

        # Adjust layout and save
        fig.tight_layout()
        fig.savefig(chart_file, dpi=300)
    finally:
        # Release the figure even when plotting or saving fails; otherwise
        # it stays registered with pyplot.
        plt.close(fig)

    print(f"✓ Visualization charts saved to {output_dir}/market_analysis_charts.png")
def main():
"""Main function demonstrating advanced scraper usage."""
# Setup logging
setup_logger(verbose=True)
# Create custom configuration
config = create_custom_config()
print("✓ Custom configuration created")
# Initialize scraper
try:
scraper = MarketTrendsScraper(config, headless=True)
print("✓ Scraper initialized successfully")
except Exception as e:
print(f"✗ Failed to initialize scraper: {str(e)}")
return 1
try:
# Scrape market trends data
print("\n🔍 Scraping market trends data...")
data = scraper.scrape_market_trends()
print(f"✓ Scraped {len(data)} product records")
if not data:
print("⚠ No data was scraped. Check your configuration and selectors.")
return 0
# Save raw data
output_file = "../data/advanced_example_output.csv"
scraper.save_data(data, output_file)
print(f"✓ Raw data saved to {output_file}")
# Perform advanced analysis
print("\n📊 Performing advanced analysis...")
analysis = analyze_price_trends(data)
print("✓ Advanced analysis completed")
# Save analysis results
analysis_file = "../data/advanced_example_analysis.json"
with open(analysis_file, 'w') as f:
json.dump(analysis, f, indent=2)
print(f"✓ Analysis saved to {analysis_file}")
# Generate visualizations
print("\n📈 Generating visualization charts...")
charts_dir = "../data/charts"
generate_visualizations(data, analysis, charts_dir)
# Print detailed summary
print("\n📋 Detailed Summary:")
print(f" - Total products: {analysis.get('total_products', 0)}")
if 'price_statistics' in analysis:
price_stats = analysis['price_statistics']
print(f" - Average price: £{price_stats.get('mean', 0):.2f}")
print(f" - Median price: £{price_stats.get('median', 0):.2f}")
print(f" - Standard deviation: £{price_stats.get('std', 0):.2f}")
print(f" - Price range: £{price_stats.get('min', 0):.2f} - £{price_stats.get('max', 0):.2f}")
if 'price_distribution' in analysis:
print(" - Price distribution:")
for category, count in analysis['price_distribution'].items():
print(f" * {category}: {count} products")
if 'rating_statistics' in analysis:
rating_stats = analysis['rating_statistics']
print(f" - Average rating: {rating_stats.get('mean', 0):.2f}")
print(" - Rating distribution:")
for rating, count in rating_stats['distribution'].items():
print(f" * {rating} stars: {count} products")
print("\n✅ Advanced market trends analysis completed successfully!")
return 0