Initial commit: Market Trends Scraper
src/scraper.py (new file, 618 lines)
@@ -0,0 +1,618 @@
"""
Web Scraper Module

This module contains the core functionality for scraping e-commerce websites
to collect product and pricing data for market trend analysis.
"""

import time
import json
import random
from datetime import datetime
from typing import Dict, List, Any, Optional, Union
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException, NoSuchElementException

from logger import LoggerMixin
from config_manager import ConfigManager


class MarketTrendsScraper(LoggerMixin):
    """
    Main scraper class for collecting market trends data from e-commerce websites.
    """

    def __init__(self, config: Dict[str, Any], headless: bool = True):
        """
        Initialize the scraper with configuration.

        Args:
            config: Configuration dictionary
            headless: Whether to run browser in headless mode
        """
        self.config = config
        self.driver = None
        self.session = requests.Session()
        self.data = []
        self._setup_browser(headless)
        self._setup_session()

    def _setup_browser(self, headless: bool = True) -> None:
        """
        Set up the Selenium WebDriver with appropriate options.

        Args:
            headless: Whether to run browser in headless mode
        """
        try:
            chrome_options = Options()

            # Set headless mode
            if headless:
                chrome_options.add_argument("--headless")

            # Set window size
            window_size = self.config.get("scraper.window_size", [1920, 1080])
            chrome_options.add_argument(f"--window-size={window_size[0]},{window_size[1]}")

            # Add other options for stability
            chrome_options.add_argument("--no-sandbox")
            chrome_options.add_argument("--disable-dev-shm-usage")
            chrome_options.add_argument("--disable-gpu")
            chrome_options.add_argument("--disable-web-security")
            chrome_options.add_argument("--disable-features=VizDisplayCompositor")

            # Set user agent
            user_agent = self.config.get("scraper.user_agent", "")
            if user_agent:
                chrome_options.add_argument(f"--user-agent={user_agent}")

            # Initialize driver
            self.driver = webdriver.Chrome(options=chrome_options)
            self.driver.set_page_load_timeout(self.config.get("scraper.timeout", 30))

            self.logger.info("Browser setup completed")

        except Exception as e:
            self.logger.error(f"Failed to setup browser: {str(e)}")
            raise

    def _setup_session(self) -> None:
        """
        Set up the requests session with appropriate headers.
        """
        user_agent = self.config.get("scraper.user_agent", "")
        if user_agent:
            self.session.headers.update({"User-Agent": user_agent})

        # Add other headers
        self.session.headers.update({
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1"
        })

        self.logger.info("Session setup completed")

    def scrape_market_trends(self) -> List[Dict[str, Any]]:
        """
        Scrape market trends data from all configured sources.

        Returns:
            List of dictionaries containing scraped data
        """
        all_data = []
        sources = self.config.get("sources", [])

        for source in sources:
            if not source.get("enabled", True):
                self.logger.info(f"Skipping disabled source: {source.get('name', 'Unknown')}")
                continue

            self.logger.info(f"Scraping source: {source.get('name', 'Unknown')}")

            try:
                source_data = self._scrape_source(source)
                all_data.extend(source_data)
                self.logger.info(f"Scraped {len(source_data)} records from {source.get('name', 'Unknown')}")
            except Exception as e:
                self.logger.error(f"Failed to scrape source {source.get('name', 'Unknown')}: {str(e)}")
                continue

        return all_data

    def _scrape_source(self, source: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Scrape data from a specific source.

        Args:
            source: Source configuration dictionary

        Returns:
            List of dictionaries containing scraped data
        """
        source_data = []
        url = source.get("url", "")
        selectors = source.get("selectors", {})
        pagination = source.get("pagination", {})

        if not url:
            self.logger.warning(f"No URL configured for source: {source.get('name', 'Unknown')}")
            return source_data

        # Determine scraping method
        use_selenium = source.get("use_selenium", False)

        if use_selenium:
            return self._scrape_with_selenium(source)
        else:
            return self._scrape_with_requests(source)

    def _scrape_with_requests(self, source: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Scrape data using requests and BeautifulSoup.

        Args:
            source: Source configuration dictionary

        Returns:
            List of dictionaries containing scraped data
        """
        source_data = []
        url = source.get("url", "")
        selectors = source.get("selectors", {})
        pagination = source.get("pagination", {})
        max_pages = pagination.get("max_pages", 1)

        for page in range(1, max_pages + 1):
            try:
                # Add page parameter if needed
                page_url = url
                if page > 1:
                    page_url = f"{url}?page={page}"

                self.logger.debug(f"Scraping page {page}: {page_url}")

                # Make request with retry logic
                response = self._make_request_with_retry(page_url)

                if not response:
                    continue

                # Parse HTML
                soup = BeautifulSoup(response.text, 'html.parser')

                # Extract product data
                products = soup.select(selectors.get("product", ""))

                for product in products:
                    product_data = self._extract_product_data(product, selectors)
                    if product_data:
                        product_data["source"] = source.get("name", "Unknown")
                        product_data["scraped_at"] = datetime.now().isoformat()
                        source_data.append(product_data)

                # Check if there's a next page
                if page < max_pages:
                    next_page = soup.select_one(pagination.get("next_page", ""))
                    if not next_page:
                        self.logger.debug(f"No more pages found after page {page}")
                        break

                # Delay between requests
                delay = self.config.get("scraper.delay_between_requests", 1.0)
                time.sleep(delay + random.uniform(0, 1))

            except Exception as e:
                self.logger.error(f"Error scraping page {page} from {source.get('name', 'Unknown')}: {str(e)}")
                continue

        return source_data

    def _scrape_with_selenium(self, source: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Scrape data using Selenium WebDriver.

        Args:
            source: Source configuration dictionary

        Returns:
            List of dictionaries containing scraped data
        """
        source_data = []
        url = source.get("url", "")
        selectors = source.get("selectors", {})
        pagination = source.get("pagination", {})
        max_pages = pagination.get("max_pages", 1)

        try:
            self.driver.get(url)

            for page in range(1, max_pages + 1):
                self.logger.debug(f"Scraping page {page} with Selenium")

                # Wait for products to load
                try:
                    WebDriverWait(self.driver, 10).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, selectors.get("product", "")))
                    )
                except TimeoutException:
                    self.logger.warning(f"Timeout waiting for products to load on page {page}")
                    continue

                # Extract product data
                products = self.driver.find_elements(By.CSS_SELECTOR, selectors.get("product", ""))

                for product in products:
                    product_data = self._extract_product_data_selenium(product, selectors)
                    if product_data:
                        product_data["source"] = source.get("name", "Unknown")
                        product_data["scraped_at"] = datetime.now().isoformat()
                        source_data.append(product_data)

                # Navigate to next page if available
                if page < max_pages:
                    try:
                        next_button = self.driver.find_element(By.CSS_SELECTOR, pagination.get("next_page", ""))
                        next_button.click()

                        # Wait for page to load
                        time.sleep(2)
                    except NoSuchElementException:
                        self.logger.debug(f"No next page button found after page {page}")
                        break

                # Delay between requests
                delay = self.config.get("scraper.delay_between_requests", 1.0)
                time.sleep(delay + random.uniform(0, 1))

        except Exception as e:
            self.logger.error(f"Error scraping with Selenium from {source.get('name', 'Unknown')}: {str(e)}")

        return source_data

    def _make_request_with_retry(self, url: str) -> Optional[requests.Response]:
        """
        Make HTTP request with retry logic.

        Args:
            url: URL to request

        Returns:
            Response object or None if failed
        """
        max_retries = self.config.get("scraper.max_retries", 3)
        timeout = self.config.get("scraper.timeout", 30)

        for attempt in range(max_retries):
            try:
                response = self.session.get(url, timeout=timeout)
                response.raise_for_status()
                return response
            except requests.RequestException as e:
                self.logger.warning(f"Request attempt {attempt + 1} failed for {url}: {str(e)}")
                if attempt < max_retries - 1:
                    # Exponential backoff
                    time.sleep((2 ** attempt) + random.uniform(0, 1))
                else:
                    self.logger.error(f"Max retries exceeded for {url}")
                    return None

        return None

    def _extract_product_data(self, product: BeautifulSoup, selectors: Dict[str, str]) -> Optional[Dict[str, Any]]:
        """
        Extract product data from HTML element using BeautifulSoup.

        Args:
            product: BeautifulSoup element containing product data
            selectors: Dictionary of CSS selectors

        Returns:
            Dictionary containing product data or None if extraction failed
        """
        try:
            data = {}

            # Extract name
            name_element = product.select_one(selectors.get("name", ""))
            data["name"] = name_element.get_text(strip=True) if name_element else None

            # Extract price
            price_element = product.select_one(selectors.get("price", ""))
            if price_element:
                price_text = price_element.get_text(strip=True)
                data["price"] = self._parse_price(price_text)
            else:
                data["price"] = None

            # Extract rating
            rating_element = product.select_one(selectors.get("rating", ""))
            if rating_element:
                rating_text = rating_element.get_text(strip=True)
                data["rating"] = self._parse_rating(rating_text)
            else:
                data["rating"] = None

            # Extract availability
            availability_element = product.select_one(selectors.get("availability", ""))
            data["availability"] = availability_element.get_text(strip=True) if availability_element else None

            # Extract URL if available
            link_element = product.select_one("a")
            if link_element and link_element.get("href"):
                data["url"] = link_element.get("href")

            return data if data.get("name") else None

        except Exception as e:
            self.logger.error(f"Error extracting product data: {str(e)}")
            return None

    def _extract_product_data_selenium(self, product, selectors: Dict[str, str]) -> Optional[Dict[str, Any]]:
        """
        Extract product data from Selenium WebElement.

        Args:
            product: Selenium WebElement containing product data
            selectors: Dictionary of CSS selectors

        Returns:
            Dictionary containing product data or None if extraction failed
        """
        try:
            data = {}

            # Extract name
            try:
                name_element = product.find_element(By.CSS_SELECTOR, selectors.get("name", ""))
                data["name"] = name_element.text.strip()
            except NoSuchElementException:
                data["name"] = None

            # Extract price
            try:
                price_element = product.find_element(By.CSS_SELECTOR, selectors.get("price", ""))
                price_text = price_element.text.strip()
                data["price"] = self._parse_price(price_text)
            except NoSuchElementException:
                data["price"] = None

            # Extract rating
            try:
                rating_element = product.find_element(By.CSS_SELECTOR, selectors.get("rating", ""))
                rating_text = rating_element.text.strip()
                data["rating"] = self._parse_rating(rating_text)
            except NoSuchElementException:
                data["rating"] = None

            # Extract availability
            try:
                availability_element = product.find_element(By.CSS_SELECTOR, selectors.get("availability", ""))
                data["availability"] = availability_element.text.strip()
            except NoSuchElementException:
                data["availability"] = None

            # Extract URL if available
            try:
                link_element = product.find_element(By.CSS_SELECTOR, "a")
                if link_element.get_attribute("href"):
                    data["url"] = link_element.get_attribute("href")
            except NoSuchElementException:
                pass

            return data if data.get("name") else None

        except Exception as e:
            self.logger.error(f"Error extracting product data with Selenium: {str(e)}")
            return None

    def _parse_price(self, price_text: str) -> Optional[float]:
        """
        Parse price text to extract numeric value.

        Args:
            price_text: Raw price text

        Returns:
            Parsed price as float or None if parsing failed
        """
        try:
            # Remove currency symbols and whitespace
            price_clean = price_text.replace('$', '').replace('€', '').replace('£', '').strip()

            # Handle comma as decimal separator
            price_clean = price_clean.replace(',', '.')

            # Extract numeric part
            import re
            price_match = re.search(r'[\d.]+', price_clean)
            if price_match:
                return float(price_match.group())

            return None

        except Exception as e:
            self.logger.error(f"Error parsing price '{price_text}': {str(e)}")
            return None

    def _parse_rating(self, rating_text: str) -> Optional[float]:
        """
        Parse rating text to extract numeric value.

        Args:
            rating_text: Raw rating text

        Returns:
            Parsed rating as float or None if parsing failed
        """
        try:
            # Extract numeric part
            import re
            rating_match = re.search(r'[\d.]+', rating_text)
            if rating_match:
                return float(rating_match.group())

            return None

        except Exception as e:
            self.logger.error(f"Error parsing rating '{rating_text}': {str(e)}")
            return None

    def save_data(self, data: List[Dict[str, Any]], output_path: str) -> None:
        """
        Save scraped data to file.

        Args:
            data: List of dictionaries containing scraped data
            output_path: Path to output file
        """
        try:
            output_file = Path(output_path)
            output_file.parent.mkdir(parents=True, exist_ok=True)

            # Convert to DataFrame
            df = pd.DataFrame(data)

            # Save based on file extension
            if output_path.endswith('.csv'):
                df.to_csv(output_path, index=False)
            elif output_path.endswith('.json'):
                df.to_json(output_path, orient='records', indent=2)
            elif output_path.endswith('.xlsx'):
                df.to_excel(output_path, index=False)
            else:
                # Default to CSV
                output_path = output_path + '.csv'
                df.to_csv(output_path, index=False)

            self.logger.info(f"Data saved to {output_path}")

        except Exception as e:
            self.logger.error(f"Error saving data to {output_path}: {str(e)}")
            raise

    def analyze_trends(self, data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Analyze market trends from scraped data.

        Args:
            data: List of dictionaries containing scraped data

        Returns:
            Dictionary containing trend analysis results
        """
        try:
            if not data:
                return {"error": "No data available for analysis"}

            # Convert to DataFrame
            df = pd.DataFrame(data)

            # Basic statistics
            analysis = {
                "total_products": len(df),
                "sources": df["source"].value_counts().to_dict(),
                "price_analysis": {},
                "rating_analysis": {},
                "availability_analysis": {}
            }

            # Price analysis
            if "price" in df.columns:
                price_data = df["price"].dropna()
                if not price_data.empty:
                    analysis["price_analysis"] = {
                        "average_price": float(price_data.mean()),
                        "min_price": float(price_data.min()),
                        "max_price": float(price_data.max()),
                        "median_price": float(price_data.median()),
                        "price_distribution": price_data.describe().to_dict()
                    }

            # Rating analysis
            if "rating" in df.columns:
                rating_data = df["rating"].dropna()
                if not rating_data.empty:
                    analysis["rating_analysis"] = {
                        "average_rating": float(rating_data.mean()),
                        "min_rating": float(rating_data.min()),
                        "max_rating": float(rating_data.max()),
                        "rating_distribution": rating_data.value_counts().to_dict()
                    }

            # Availability analysis
            if "availability" in df.columns:
                availability_data = df["availability"].dropna()
                if not availability_data.empty:
                    analysis["availability_analysis"] = availability_data.value_counts().to_dict()

            # Price trends by source
            if "price" in df.columns and "source" in df.columns:
                price_by_source = df.groupby("source")["price"].agg(["mean", "min", "max"]).to_dict()
                analysis["price_by_source"] = price_by_source

            self.logger.info("Trend analysis completed")
            return analysis

        except Exception as e:
            self.logger.error(f"Error analyzing trends: {str(e)}")
            return {"error": str(e)}

    def save_analysis(self, analysis: Dict[str, Any], output_path: str) -> None:
        """
        Save trend analysis results to file.

        Args:
            analysis: Dictionary containing analysis results
            output_path: Path to output file
        """
        try:
            output_file = Path(output_path)
            output_file.parent.mkdir(parents=True, exist_ok=True)

            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(analysis, f, indent=2, ensure_ascii=False)

            self.logger.info(f"Analysis saved to {output_path}")

        except Exception as e:
            self.logger.error(f"Error saving analysis to {output_path}: {str(e)}")
            raise

    def close(self) -> None:
        """
        Close browser and session.
        """
        try:
            if self.driver:
                self.driver.quit()
                self.driver = None

            if self.session:
                self.session.close()

            self.logger.info("Browser and session closed")

        except Exception as e:
            self.logger.error(f"Error closing browser/session: {str(e)}")

    def __enter__(self):
        """Context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit."""
        self.close()
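Usage sketch (not part of the committed file): the class reads its settings via plain dict lookups such as self.config.get("scraper.timeout", 30), so the minimal assumption below is a flat dict with those literal dotted keys. The source name, URL, selectors, and output paths are hypothetical placeholders, and because __init__ always launches Chrome, a chromedriver must be available even when a source uses the requests path.

    # Hypothetical example; config keys mirror the self.config.get(...) calls in scraper.py.
    from scraper import MarketTrendsScraper

    config = {
        "scraper.timeout": 30,
        "scraper.max_retries": 3,
        "scraper.delay_between_requests": 1.0,
        "scraper.window_size": [1920, 1080],
        "scraper.user_agent": "Mozilla/5.0 (compatible; MarketTrendsBot/0.1)",  # placeholder UA
        "sources": [
            {
                "name": "Example Shop",                  # hypothetical source
                "url": "https://example.com/products",   # placeholder URL
                "enabled": True,
                "use_selenium": False,                   # requests + BeautifulSoup path
                "selectors": {
                    "product": "div.product",
                    "name": "h2.title",
                    "price": "span.price",
                    "rating": "span.rating",
                    "availability": "span.stock",
                },
                "pagination": {"max_pages": 2, "next_page": "a.next"},
            }
        ],
    }

    # Context manager ensures the Chrome driver and the requests session are closed.
    with MarketTrendsScraper(config, headless=True) as scraper:
        records = scraper.scrape_market_trends()
        scraper.save_data(records, "output/products.csv")
        analysis = scraper.analyze_trends(records)
        scraper.save_analysis(analysis, "output/analysis.json")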