Files
Martin c40d44e4c9 Add redirect tracking and analysis features
Database Schema:
- Added redirect_url VARCHAR(2048) to pages table
- Added redirect_count INT DEFAULT 0 to pages table
- Added index on redirect_count for faster queries

Configuration:
- Created Config class with typed constants (PHP 8.3+)
- MAX_REDIRECT_THRESHOLD = 3 (configurable warning threshold)
- MAX_CRAWL_DEPTH = 50
- CONCURRENCY = 10

Backend Changes:
- Crawler now tracks redirects using Guzzle's redirect tracking
- Extracts redirect history from response headers
- Records redirect count and final destination URL
- Guzzle configured with max 10 redirects and tracking enabled

API Endpoint:
- New endpoint: /api.php?action=redirects
- Analyzes redirect types (permanent 301/308 vs temporary 302/303/307)
- Identifies excessive redirects (> threshold)
- Returns statistics and detailed redirect information

Frontend Changes:
- Added "Redirects" tab with:
  * Statistics overview (Total, Permanent, Temporary, Excessive)
  * Detailed table showing all redirects
  * Visual warnings for excessive redirects (yellow background)
  * Color-coded redirect counts (red when > threshold)
  * Status code badges (green for permanent, blue for temporary)

All quality checks pass:
- PHPStan Level 8: 0 errors
- PHPCS PSR-12: 0 errors

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-04 09:40:26 +02:00

80 lines
2.9 KiB
SQL

/**
* Web Crawler - Database Schema
*
* @copyright Copyright (c) 2025 Martin Kiesewetter
* @author Martin Kiesewetter <mki@kies-media.de>
* @link https://kies-media.de
*/
-- Database initialization script for Web Crawler
-- Crawl Jobs Table
CREATE TABLE IF NOT EXISTS crawl_jobs (
id INT AUTO_INCREMENT PRIMARY KEY,
domain VARCHAR(255) NOT NULL,
status ENUM('pending', 'running', 'completed', 'failed') DEFAULT 'pending',
total_pages INT DEFAULT 0,
total_links INT DEFAULT 0,
started_at TIMESTAMP NULL,
completed_at TIMESTAMP NULL,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
INDEX idx_domain (domain),
INDEX idx_status (status)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- Pages Table
CREATE TABLE IF NOT EXISTS pages (
id INT AUTO_INCREMENT PRIMARY KEY,
crawl_job_id INT NOT NULL,
url VARCHAR(2048) NOT NULL,
title VARCHAR(500),
meta_description TEXT,
status_code INT,
content_type VARCHAR(100),
redirect_url VARCHAR(2048),
redirect_count INT DEFAULT 0,
crawled_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (crawl_job_id) REFERENCES crawl_jobs(id) ON DELETE CASCADE,
INDEX idx_crawl_job (crawl_job_id),
INDEX idx_url (url(255)),
INDEX idx_status_code (status_code),
INDEX idx_redirect_count (redirect_count),
UNIQUE KEY unique_job_url (crawl_job_id, url(255))
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- Links Table
CREATE TABLE IF NOT EXISTS links (
id INT AUTO_INCREMENT PRIMARY KEY,
page_id INT NOT NULL,
crawl_job_id INT NOT NULL,
source_url VARCHAR(2048) NOT NULL,
target_url VARCHAR(2048) NOT NULL,
link_text VARCHAR(1000),
is_nofollow BOOLEAN DEFAULT FALSE,
is_internal BOOLEAN DEFAULT TRUE,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (page_id) REFERENCES pages(id) ON DELETE CASCADE,
FOREIGN KEY (crawl_job_id) REFERENCES crawl_jobs(id) ON DELETE CASCADE,
INDEX idx_page (page_id),
INDEX idx_crawl_job (crawl_job_id),
INDEX idx_source_url (source_url(255)),
INDEX idx_target_url (target_url(255)),
INDEX idx_nofollow (is_nofollow)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
-- Queue Table for parallel processing
CREATE TABLE IF NOT EXISTS crawl_queue (
id INT AUTO_INCREMENT PRIMARY KEY,
crawl_job_id INT NOT NULL,
url VARCHAR(2048) NOT NULL,
depth INT DEFAULT 0,
status ENUM('pending', 'processing', 'completed', 'failed') DEFAULT 'pending',
retry_count INT DEFAULT 0,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
processed_at TIMESTAMP NULL,
FOREIGN KEY (crawl_job_id) REFERENCES crawl_jobs(id) ON DELETE CASCADE,
INDEX idx_status (status),
INDEX idx_crawl_job (crawl_job_id),
UNIQUE KEY unique_job_url (crawl_job_id, url(255))
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;