Initial commit: Market Trends Scraper

This commit is contained in:
Dev
2025-09-11 17:46:14 +03:00
commit 4ddcde68d4
17 changed files with 3049 additions and 0 deletions

355
tests/test_integration.py Normal file
View File

@@ -0,0 +1,355 @@
"""
Integration tests for the Market Trends Scraper application.
These tests verify that all components work together correctly.
"""
import pytest
import tempfile
import os
import json
from pathlib import Path
from unittest.mock import patch, Mock
from src.config_manager import ConfigManager
from src.scraper import MarketTrendsScraper
from src.logger import setup_logger
import main
class TestIntegration:
    """Integration test cases for the entire application.

    These tests wire real ConfigManager / MarketTrendsScraper instances
    together while mocking out the Selenium webdriver and the per-source
    ``_scrape_source`` step, so the full pipeline
    (config -> scrape -> save -> analyze) runs without a browser or network.
    """

    @pytest.fixture
    def temp_dir(self):
        """Create a temporary directory for test files.

        Yields the directory path as a ``str``; the directory and its
        contents are removed automatically on teardown.
        """
        with tempfile.TemporaryDirectory() as temp_dir:
            yield temp_dir

    @pytest.fixture
    def sample_config_file(self, temp_dir):
        """Write a complete sample YAML configuration and return its path."""
        config_path = Path(temp_dir) / "config.yaml"
        config_content = {
            "scraper": {
                "delay_between_requests": 0.1,  # Faster for testing
                "timeout": 10,
                "max_retries": 2,
                "user_agent": "Mozilla/5.0 (Test)",
                "headless": True,
                "window_size": [1024, 768]
            },
            "sources": [
                {
                    "name": "test_source",
                    "url": "https://httpbin.org/html",
                    "type": "ecommerce",
                    "enabled": True,
                    "selectors": {
                        "product": "p",
                        "name": "p",
                        "price": "p",
                        "rating": "p",
                        "availability": "p"
                    },
                    "pagination": {
                        "next_page": "a",
                        "max_pages": 1
                    }
                }
            ],
            "output": {
                "format": "csv",
                "include_timestamp": True,
                "filename": "test_output"
            },
            "database": {
                # SQLite file lives inside the temp dir so it is cleaned up.
                "url": f"sqlite:///{temp_dir}/test.db",
                "echo": False
            },
            "analysis": {
                "price_history_days": 30,
                "trend_threshold": 0.05,
                "generate_charts": True
            }
        }
        import yaml
        with open(config_path, 'w') as f:
            yaml.dump(config_content, f)
        return str(config_path)

    @patch('src.scraper.webdriver')
    def test_full_workflow(self, mock_webdriver, sample_config_file, temp_dir):
        """Test the complete workflow from config loading to analysis."""
        # Setup mock driver so no real browser is launched.
        mock_driver = Mock()
        mock_webdriver.Chrome.return_value = mock_driver
        # Setup logger
        setup_logger()
        # Load configuration through the real ConfigManager.
        config_manager = ConfigManager(sample_config_file)
        config = config_manager.load_config()
        # Verify config was loaded
        assert config["scraper"]["delay_between_requests"] == 0.1
        assert len(config["sources"]) == 1
        assert config["sources"][0]["name"] == "test_source"
        # Initialize scraper
        scraper = MarketTrendsScraper(config, headless=True)
        # Mock the scraping process to return sample data
        sample_data = [
            {
                "name": "Test Product 1",
                "price": 19.99,
                "rating": 4.5,
                "availability": "In Stock",
                "source": "test_source",
                "scraped_at": "2023-01-01T00:00:00"
            },
            {
                "name": "Test Product 2",
                "price": 29.99,
                "rating": 3.8,
                "availability": "Out of Stock",
                "source": "test_source",
                "scraped_at": "2023-01-01T00:00:00"
            }
        ]
        with patch.object(scraper, '_scrape_source', return_value=sample_data):
            # Scrape data
            data = scraper.scrape_market_trends()
            # Verify data was scraped
            assert len(data) == 2
            assert data[0]["name"] == "Test Product 1"
            assert data[1]["price"] == 29.99
            # Save data and verify the CSV file was created.
            output_path = Path(temp_dir) / "test_output.csv"
            scraper.save_data(data, str(output_path))
            assert output_path.exists()
            # Analyze trends; 19.99 and 29.99 average to exactly 24.99.
            analysis = scraper.analyze_trends(data)
            assert analysis["total_products"] == 2
            assert "price_analysis" in analysis
            assert analysis["price_analysis"]["average_price"] == 24.99
            assert analysis["price_analysis"]["min_price"] == 19.99
            assert analysis["price_analysis"]["max_price"] == 29.99
            # Save analysis and verify the JSON round-trips correctly.
            analysis_path = Path(temp_dir) / "test_analysis.json"
            scraper.save_analysis(analysis, str(analysis_path))
            assert analysis_path.exists()
            with open(analysis_path, 'r') as f:
                saved_analysis = json.load(f)
            assert saved_analysis["total_products"] == 2
            assert saved_analysis["price_analysis"]["average_price"] == 24.99

    @patch('src.scraper.webdriver')
    def test_multiple_sources(self, mock_webdriver, temp_dir):
        """Test scraping from multiple sources, skipping disabled ones."""
        # Setup mock driver
        mock_driver = Mock()
        mock_webdriver.Chrome.return_value = mock_driver
        # Create config with two enabled sources and one disabled source.
        config_content = {
            "scraper": {
                "delay_between_requests": 0.1,
                "timeout": 10,
                "headless": True
            },
            "sources": [
                {
                    "name": "source_1",
                    "url": "https://example1.com",
                    "enabled": True,
                    "selectors": {
                        "product": "div.product",
                        "name": "h2",
                        "price": "span.price"
                    }
                },
                {
                    "name": "source_2",
                    "url": "https://example2.com",
                    "enabled": True,
                    "selectors": {
                        "product": "div.item",
                        "name": "h3",
                        "price": "div.cost"
                    }
                },
                {
                    "name": "source_3",
                    "url": "https://example3.com",
                    "enabled": False,  # Disabled source
                    "selectors": {
                        "product": "div.product",
                        "name": "h2",
                        "price": "span.price"
                    }
                }
            ]
        }
        import yaml
        config_path = Path(temp_dir) / "multi_source_config.yaml"
        with open(config_path, 'w') as f:
            yaml.dump(config_content, f)
        # Initialize scraper directly from the dict (the YAML file on disk
        # exercises serialization but is not re-read here).
        scraper = MarketTrendsScraper(config_content, headless=True)

        # Mock different data for each source
        def mock_scrape_source(source):
            if source["name"] == "source_1":
                return [{"name": "Product 1", "price": 10.00, "source": "source_1"}]
            elif source["name"] == "source_2":
                return [{"name": "Product 2", "price": 20.00, "source": "source_2"}]
            else:
                return []

        with patch.object(scraper, '_scrape_source', side_effect=mock_scrape_source):
            data = scraper.scrape_market_trends()
            # Verify data from both enabled sources
            assert len(data) == 2
            sources = {item["source"] for item in data}
            assert "source_1" in sources
            assert "source_2" in sources
            assert "source_3" not in sources  # Disabled source should not appear

    @patch('src.scraper.webdriver')
    def test_error_handling(self, mock_webdriver, temp_dir):
        """Test error handling when scraping one source fails."""
        # Setup mock driver
        mock_driver = Mock()
        mock_webdriver.Chrome.return_value = mock_driver
        config = {
            "scraper": {
                "delay_between_requests": 0.1,
                "timeout": 10,
                "headless": True
            },
            "sources": [
                {
                    "name": "working_source",
                    "url": "https://example.com",
                    "enabled": True,
                    "selectors": {
                        "product": "div.product",
                        "name": "h2",
                        "price": "span.price"
                    }
                },
                {
                    "name": "failing_source",
                    "url": "https://example.com",
                    "enabled": True,
                    "selectors": {
                        "product": "div.product",
                        "name": "h2",
                        "price": "span.price"
                    }
                }
            ]
        }
        # Initialize scraper
        scraper = MarketTrendsScraper(config, headless=True)

        # Mock one source to succeed and one to fail
        def mock_scrape_source(source):
            if source["name"] == "working_source":
                return [{"name": "Working Product", "price": 15.00, "source": "working_source"}]
            else:
                raise Exception("Scraping failed")

        with patch.object(scraper, '_scrape_source', side_effect=mock_scrape_source):
            data = scraper.scrape_market_trends()
            # A failure in one source must not lose the other source's data.
            assert len(data) == 1
            assert data[0]["source"] == "working_source"

    # NOTE: a former class-level @patch('sys.argv', ...) decorator was removed
    # here — it was immediately shadowed by the `with patch('sys.argv', ...)`
    # inside the test before main.main() was ever called.
    @patch('src.scraper.webdriver')
    def test_main_entry_point(self, mock_webdriver, temp_dir):
        """Test the main entry point of the application."""
        # Setup mock driver
        mock_driver = Mock()
        mock_webdriver.Chrome.return_value = mock_driver
        # Create test config
        config_path = Path(temp_dir) / "test_config.yaml"
        config_content = {
            "scraper": {
                "delay_between_requests": 0.1,
                "timeout": 10,
                "headless": True
            },
            "sources": [
                {
                    "name": "test_source",
                    "url": "https://example.com",
                    "enabled": True,
                    "selectors": {
                        "product": "div.product",
                        "name": "h2",
                        "price": "span.price"
                    }
                }
            ]
        }
        import yaml
        with open(config_path, 'w') as f:
            yaml.dump(config_content, f)
        # Mock the scraper to return sample data
        sample_data = [{"name": "Test Product", "price": 19.99, "source": "test_source"}]
        with patch('main.ConfigManager') as mock_config_manager, \
                patch('main.MarketTrendsScraper') as mock_scraper_class:
            # Setup mocks
            mock_config_instance = Mock()
            mock_config_manager.return_value = mock_config_instance
            mock_config_instance.load_config.return_value = config_content
            mock_scraper_instance = Mock()
            mock_scraper_class.return_value = mock_scraper_instance
            mock_scraper_instance.scrape_market_trends.return_value = sample_data
            mock_scraper_instance.analyze_trends.return_value = {"total_products": 1}
            # Run main function. BUG FIX: temp_dir is a str (the fixture
            # yields TemporaryDirectory's name), so the original
            # `temp_dir / 'output.csv'` raised TypeError; wrap in Path first.
            output_arg = str(Path(temp_dir) / 'output.csv')
            with patch('sys.argv', ['main.py', '--config', str(config_path),
                                    '--output', output_arg]):
                result = main.main()
            # Verify main completed successfully
            assert result == 0
            # Verify scraper was called
            mock_scraper_instance.scrape_market_trends.assert_called_once()
            mock_scraper_instance.save_data.assert_called_once()
            mock_scraper_instance.analyze_trends.assert_called_once()
            mock_scraper_instance.save_analysis.assert_called_once()