Add redirect tracking and analysis features
Database Schema: - Added redirect_url VARCHAR(2048) to pages table - Added redirect_count INT DEFAULT 0 to pages table - Added index on redirect_count for faster queries Configuration: - Created Config class with typed constants (PHP 8.3+) - MAX_REDIRECT_THRESHOLD = 3 (configurable warning threshold) - MAX_CRAWL_DEPTH = 50 - CONCURRENCY = 10 Backend Changes: - Crawler now tracks redirects using Guzzle's redirect tracking - Extracts redirect history from response headers - Records redirect count and final destination URL - Guzzle configured with max 10 redirects and tracking enabled API Endpoint: - New endpoint: /api.php?action=redirects - Analyzes redirect types (permanent 301/308 vs temporary 302/303/307) - Identifies excessive redirects (> threshold) - Returns statistics and detailed redirect information Frontend Changes: - Added "Redirects" tab with: * Statistics overview (Total, Permanent, Temporary, Excessive) * Detailed table showing all redirects * Visual warnings for excessive redirects (yellow background) * Color-coded redirect counts (red when > threshold) * Status code badges (green for permanent, blue for temporary) All quality checks pass: - PHPStan Level 8: 0 errors - PHPCS PSR-12: 0 errors 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -31,11 +31,14 @@ CREATE TABLE IF NOT EXISTS pages (
|
||||
meta_description TEXT,
|
||||
status_code INT,
|
||||
content_type VARCHAR(100),
|
||||
redirect_url VARCHAR(2048),
|
||||
redirect_count INT DEFAULT 0,
|
||||
crawled_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
FOREIGN KEY (crawl_job_id) REFERENCES crawl_jobs(id) ON DELETE CASCADE,
|
||||
INDEX idx_crawl_job (crawl_job_id),
|
||||
INDEX idx_url (url(255)),
|
||||
INDEX idx_status_code (status_code),
|
||||
INDEX idx_redirect_count (redirect_count),
|
||||
UNIQUE KEY unique_job_url (crawl_job_id, url(255))
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||
|
||||
|
||||
41
src/api.php
41
src/api.php
@@ -217,6 +217,47 @@ try {
|
||||
]);
|
||||
break;
|
||||
|
||||
case 'redirects':
|
||||
$jobId = $_GET['job_id'] ?? 0;
|
||||
$stmt = $db->prepare(
|
||||
"SELECT url, title, status_code, redirect_url, redirect_count FROM pages " .
|
||||
"WHERE crawl_job_id = ? AND redirect_count > 0 " .
|
||||
"ORDER BY redirect_count DESC, url"
|
||||
);
|
||||
$stmt->execute([$jobId]);
|
||||
$redirects = $stmt->fetchAll();
|
||||
|
||||
// Count redirect types
|
||||
$permanent = 0;
|
||||
$temporary = 0;
|
||||
$excessive = 0;
|
||||
$maxThreshold = 3; // From Config::MAX_REDIRECT_THRESHOLD
|
||||
|
||||
foreach ($redirects as $redirect) {
|
||||
$code = $redirect['status_code'];
|
||||
if ($code == 301 || $code == 308) {
|
||||
$permanent++;
|
||||
} elseif ($code == 302 || $code == 303 || $code == 307) {
|
||||
$temporary++;
|
||||
}
|
||||
if ($redirect['redirect_count'] > $maxThreshold) {
|
||||
$excessive++;
|
||||
}
|
||||
}
|
||||
|
||||
echo json_encode([
|
||||
'success' => true,
|
||||
'redirects' => $redirects,
|
||||
'stats' => [
|
||||
'total' => count($redirects),
|
||||
'permanent' => $permanent,
|
||||
'temporary' => $temporary,
|
||||
'excessive' => $excessive,
|
||||
'threshold' => $maxThreshold
|
||||
]
|
||||
]);
|
||||
break;
|
||||
|
||||
case 'delete':
|
||||
$jobId = $_POST['job_id'] ?? 0;
|
||||
$stmt = $db->prepare("DELETE FROM crawl_jobs WHERE id = ?");
|
||||
|
||||
29
src/classes/Config.php
Normal file
29
src/classes/Config.php
Normal file
@@ -0,0 +1,29 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Web Crawler - Configuration Class
|
||||
*
|
||||
* @copyright Copyright (c) 2025 Martin Kiesewetter
|
||||
* @author Martin Kiesewetter <mki@kies-media.de>
|
||||
* @link https://kies-media.de
|
||||
*/
|
||||
|
||||
namespace App;
|
||||
|
||||
class Config
|
||||
{
|
||||
/**
|
||||
* Maximum number of redirects before warning
|
||||
*/
|
||||
public const int MAX_REDIRECT_THRESHOLD = 3;
|
||||
|
||||
/**
|
||||
* Maximum crawl depth
|
||||
*/
|
||||
public const int MAX_CRAWL_DEPTH = 50;
|
||||
|
||||
/**
|
||||
* Number of parallel requests
|
||||
*/
|
||||
public const int CONCURRENCY = 10;
|
||||
}
|
||||
@@ -33,6 +33,10 @@ class Crawler
|
||||
$this->client = new Client([
|
||||
'timeout' => 30,
|
||||
'verify' => false,
|
||||
'allow_redirects' => [
|
||||
'max' => 10,
|
||||
'track_redirects' => true
|
||||
],
|
||||
'headers' => [
|
||||
'User-Agent' => 'WebCrawler/1.0'
|
||||
]
|
||||
@@ -144,6 +148,17 @@ class Crawler
|
||||
$contentType = $response->getHeaderLine('Content-Type');
|
||||
$body = $response->getBody()->getContents();
|
||||
|
||||
// Track redirects
|
||||
$redirectUrl = null;
|
||||
$redirectCount = 0;
|
||||
if ($response->hasHeader('X-Guzzle-Redirect-History')) {
|
||||
$redirectHistory = $response->getHeader('X-Guzzle-Redirect-History');
|
||||
$redirectCount = count($redirectHistory);
|
||||
if ($redirectCount > 0) {
|
||||
$redirectUrl = end($redirectHistory);
|
||||
}
|
||||
}
|
||||
|
||||
// Save page
|
||||
$domCrawler = new DomCrawler($body, $url);
|
||||
$title = $domCrawler->filter('title')->count() > 0
|
||||
@@ -155,13 +170,24 @@ class Crawler
|
||||
: '';
|
||||
|
||||
$stmt = $this->db->prepare(
|
||||
"INSERT INTO pages (crawl_job_id, url, title, meta_description, status_code, content_type) " .
|
||||
"VALUES (?, ?, ?, ?, ?, ?) " .
|
||||
"INSERT INTO pages (crawl_job_id, url, title, meta_description, status_code, " .
|
||||
"content_type, redirect_url, redirect_count) " .
|
||||
"VALUES (?, ?, ?, ?, ?, ?, ?, ?) " .
|
||||
"ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id), status_code = VALUES(status_code), " .
|
||||
"meta_description = VALUES(meta_description)"
|
||||
"meta_description = VALUES(meta_description), redirect_url = VALUES(redirect_url), " .
|
||||
"redirect_count = VALUES(redirect_count)"
|
||||
);
|
||||
|
||||
$stmt->execute([$this->crawlJobId, $url, $title, $metaDescription, $statusCode, $contentType]);
|
||||
$stmt->execute([
|
||||
$this->crawlJobId,
|
||||
$url,
|
||||
$title,
|
||||
$metaDescription,
|
||||
$statusCode,
|
||||
$contentType,
|
||||
$redirectUrl,
|
||||
$redirectCount
|
||||
]);
|
||||
$pageId = $this->db->lastInsertId();
|
||||
|
||||
// If pageId is 0, fetch it manually
|
||||
|
||||
@@ -251,6 +251,7 @@
|
||||
<button class="tab active" onclick="switchTab('pages')">Seiten</button>
|
||||
<button class="tab" onclick="switchTab('links')">Links</button>
|
||||
<button class="tab" onclick="switchTab('broken')">Broken Links</button>
|
||||
<button class="tab" onclick="switchTab('redirects')">Redirects</button>
|
||||
<button class="tab" onclick="switchTab('seo')">SEO Analysis</button>
|
||||
</div>
|
||||
|
||||
@@ -303,6 +304,25 @@
|
||||
</table>
|
||||
</div>
|
||||
|
||||
<div class="tab-content" id="redirects-tab">
|
||||
<h3>Redirect Statistics</h3>
|
||||
<div id="redirectStats" class="stats" style="margin-bottom: 20px;"></div>
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>URL</th>
|
||||
<th>Redirect To</th>
|
||||
<th>Status Code</th>
|
||||
<th>Redirect Count</th>
|
||||
<th>Type</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="redirectsBody">
|
||||
<tr><td colspan="5" class="loading">Keine Redirects gefunden</td></tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
<div class="tab-content" id="seo-tab">
|
||||
<h3>SEO Issues</h3>
|
||||
<div id="seoStats" style="margin-bottom: 20px;"></div>
|
||||
@@ -549,6 +569,56 @@
|
||||
}
|
||||
}
|
||||
|
||||
// Load redirects
|
||||
const redirectsResponse = await fetch(`/api.php?action=redirects&job_id=${currentJobId}`);
|
||||
const redirectsData = await redirectsResponse.json();
|
||||
|
||||
if (redirectsData.success) {
|
||||
const stats = redirectsData.stats;
|
||||
|
||||
// Redirect Stats
|
||||
document.getElementById('redirectStats').innerHTML = `
|
||||
<div class="stat-box">
|
||||
<div class="stat-label">Total Redirects</div>
|
||||
<div class="stat-value">${stats.total}</div>
|
||||
</div>
|
||||
<div class="stat-box">
|
||||
<div class="stat-label">Permanent (301/308)</div>
|
||||
<div class="stat-value">${stats.permanent}</div>
|
||||
</div>
|
||||
<div class="stat-box">
|
||||
<div class="stat-label">Temporary (302/303/307)</div>
|
||||
<div class="stat-value">${stats.temporary}</div>
|
||||
</div>
|
||||
<div class="stat-box">
|
||||
<div class="stat-label">Excessive (>${stats.threshold})</div>
|
||||
<div class="stat-value" style="color: ${stats.excessive > 0 ? '#e74c3c' : '#27ae60'}">${stats.excessive}</div>
|
||||
<div class="stat-sublabel">threshold: ${stats.threshold}</div>
|
||||
</div>
|
||||
`;
|
||||
|
||||
// Redirect Table
|
||||
if (redirectsData.redirects.length > 0) {
|
||||
document.getElementById('redirectsBody').innerHTML = redirectsData.redirects.map(redirect => {
|
||||
const isExcessive = redirect.redirect_count > stats.threshold;
|
||||
const isPermRedirect = redirect.status_code == 301 || redirect.status_code == 308;
|
||||
const redirectType = isPermRedirect ? 'Permanent' : 'Temporary';
|
||||
|
||||
return `
|
||||
<tr style="${isExcessive ? 'background-color: #fff3cd;' : ''}">
|
||||
<td class="url-cell" title="${redirect.url}">${redirect.url}</td>
|
||||
<td class="url-cell" title="${redirect.redirect_url || '-'}">${redirect.redirect_url || '-'}</td>
|
||||
<td><span class="status ${isPermRedirect ? 'completed' : 'running'}">${redirect.status_code}</span></td>
|
||||
<td><strong ${isExcessive ? 'style="color: #e74c3c;"' : ''}>${redirect.redirect_count}</strong></td>
|
||||
<td>${redirectType}</td>
|
||||
</tr>
|
||||
`;
|
||||
}).join('');
|
||||
} else {
|
||||
document.getElementById('redirectsBody').innerHTML = '<tr><td colspan="5" class="loading">Keine Redirects gefunden</td></tr>';
|
||||
}
|
||||
}
|
||||
|
||||
// Update jobs table
|
||||
loadJobs();
|
||||
} catch (e) {
|
||||
|
||||
Reference in New Issue
Block a user