Add redirect tracking and analysis features

Database Schema:
- Added redirect_url VARCHAR(2048) to pages table
- Added redirect_count INT DEFAULT 0 to pages table
- Added index on redirect_count for faster queries

Configuration:
- Created Config class with typed constants (PHP 8.3+)
- MAX_REDIRECT_THRESHOLD = 3 (configurable warning threshold)
- MAX_CRAWL_DEPTH = 50
- CONCURRENCY = 10

Backend Changes:
- Crawler now tracks redirects using Guzzle's redirect tracking
- Extracts redirect history from response headers
- Records redirect count and final destination URL
- Guzzle configured with max 10 redirects and tracking enabled

API Endpoint:
- New endpoint: /api.php?action=redirects
- Analyzes redirect types (permanent 301/308 vs temporary 302/303/307)
- Identifies excessive redirects (> threshold)
- Returns statistics and detailed redirect information

Frontend Changes:
- Added "Redirects" tab with:
  * Statistics overview (Total, Permanent, Temporary, Excessive)
  * Detailed table showing all redirects
  * Visual warnings for excessive redirects (yellow background)
  * Color-coded redirect counts (red when > threshold)
  * Status code badges (green for permanent, blue for temporary)

All quality checks pass:
- PHPStan Level 8: 0 errors
- PHPCS PSR-12: 0 errors

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-10-04 09:40:26 +02:00
parent e6b75410ed
commit c40d44e4c9
5 changed files with 173 additions and 4 deletions

View File

@@ -31,11 +31,14 @@ CREATE TABLE IF NOT EXISTS pages (
meta_description TEXT, meta_description TEXT,
status_code INT, status_code INT,
content_type VARCHAR(100), content_type VARCHAR(100),
redirect_url VARCHAR(2048),
redirect_count INT DEFAULT 0,
crawled_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, crawled_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (crawl_job_id) REFERENCES crawl_jobs(id) ON DELETE CASCADE, FOREIGN KEY (crawl_job_id) REFERENCES crawl_jobs(id) ON DELETE CASCADE,
INDEX idx_crawl_job (crawl_job_id), INDEX idx_crawl_job (crawl_job_id),
INDEX idx_url (url(255)), INDEX idx_url (url(255)),
INDEX idx_status_code (status_code), INDEX idx_status_code (status_code),
INDEX idx_redirect_count (redirect_count),
UNIQUE KEY unique_job_url (crawl_job_id, url(255)) UNIQUE KEY unique_job_url (crawl_job_id, url(255))
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;

View File

@@ -217,6 +217,47 @@ try {
]); ]);
break; break;
case 'redirects':
$jobId = $_GET['job_id'] ?? 0;
$stmt = $db->prepare(
"SELECT url, title, status_code, redirect_url, redirect_count FROM pages " .
"WHERE crawl_job_id = ? AND redirect_count > 0 " .
"ORDER BY redirect_count DESC, url"
);
$stmt->execute([$jobId]);
$redirects = $stmt->fetchAll();
// Count redirect types
$permanent = 0;
$temporary = 0;
$excessive = 0;
$maxThreshold = 3; // From Config::MAX_REDIRECT_THRESHOLD
foreach ($redirects as $redirect) {
$code = $redirect['status_code'];
if ($code == 301 || $code == 308) {
$permanent++;
} elseif ($code == 302 || $code == 303 || $code == 307) {
$temporary++;
}
if ($redirect['redirect_count'] > $maxThreshold) {
$excessive++;
}
}
echo json_encode([
'success' => true,
'redirects' => $redirects,
'stats' => [
'total' => count($redirects),
'permanent' => $permanent,
'temporary' => $temporary,
'excessive' => $excessive,
'threshold' => $maxThreshold
]
]);
break;
case 'delete': case 'delete':
$jobId = $_POST['job_id'] ?? 0; $jobId = $_POST['job_id'] ?? 0;
$stmt = $db->prepare("DELETE FROM crawl_jobs WHERE id = ?"); $stmt = $db->prepare("DELETE FROM crawl_jobs WHERE id = ?");

29
src/classes/Config.php Normal file
View File

@@ -0,0 +1,29 @@
<?php
/**
* Web Crawler - Configuration Class
*
* @copyright Copyright (c) 2025 Martin Kiesewetter
* @author Martin Kiesewetter <mki@kies-media.de>
* @link https://kies-media.de
*/
namespace App;
class Config
{
/**
* Maximum number of redirects before warning
*/
public const int MAX_REDIRECT_THRESHOLD = 3;
/**
* Maximum crawl depth
*/
public const int MAX_CRAWL_DEPTH = 50;
/**
* Number of parallel requests
*/
public const int CONCURRENCY = 10;
}

View File

@@ -33,6 +33,10 @@ class Crawler
$this->client = new Client([ $this->client = new Client([
'timeout' => 30, 'timeout' => 30,
'verify' => false, 'verify' => false,
'allow_redirects' => [
'max' => 10,
'track_redirects' => true
],
'headers' => [ 'headers' => [
'User-Agent' => 'WebCrawler/1.0' 'User-Agent' => 'WebCrawler/1.0'
] ]
@@ -144,6 +148,17 @@ class Crawler
$contentType = $response->getHeaderLine('Content-Type'); $contentType = $response->getHeaderLine('Content-Type');
$body = $response->getBody()->getContents(); $body = $response->getBody()->getContents();
// Track redirects
$redirectUrl = null;
$redirectCount = 0;
if ($response->hasHeader('X-Guzzle-Redirect-History')) {
$redirectHistory = $response->getHeader('X-Guzzle-Redirect-History');
$redirectCount = count($redirectHistory);
if ($redirectCount > 0) {
$redirectUrl = end($redirectHistory);
}
}
// Save page // Save page
$domCrawler = new DomCrawler($body, $url); $domCrawler = new DomCrawler($body, $url);
$title = $domCrawler->filter('title')->count() > 0 $title = $domCrawler->filter('title')->count() > 0
@@ -155,13 +170,24 @@ class Crawler
: ''; : '';
$stmt = $this->db->prepare( $stmt = $this->db->prepare(
"INSERT INTO pages (crawl_job_id, url, title, meta_description, status_code, content_type) " . "INSERT INTO pages (crawl_job_id, url, title, meta_description, status_code, " .
"VALUES (?, ?, ?, ?, ?, ?) " . "content_type, redirect_url, redirect_count) " .
"VALUES (?, ?, ?, ?, ?, ?, ?, ?) " .
"ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id), status_code = VALUES(status_code), " . "ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id), status_code = VALUES(status_code), " .
"meta_description = VALUES(meta_description)" "meta_description = VALUES(meta_description), redirect_url = VALUES(redirect_url), " .
"redirect_count = VALUES(redirect_count)"
); );
$stmt->execute([$this->crawlJobId, $url, $title, $metaDescription, $statusCode, $contentType]); $stmt->execute([
$this->crawlJobId,
$url,
$title,
$metaDescription,
$statusCode,
$contentType,
$redirectUrl,
$redirectCount
]);
$pageId = $this->db->lastInsertId(); $pageId = $this->db->lastInsertId();
// If pageId is 0, fetch it manually // If pageId is 0, fetch it manually

View File

@@ -251,6 +251,7 @@
<button class="tab active" onclick="switchTab('pages')">Seiten</button> <button class="tab active" onclick="switchTab('pages')">Seiten</button>
<button class="tab" onclick="switchTab('links')">Links</button> <button class="tab" onclick="switchTab('links')">Links</button>
<button class="tab" onclick="switchTab('broken')">Broken Links</button> <button class="tab" onclick="switchTab('broken')">Broken Links</button>
<button class="tab" onclick="switchTab('redirects')">Redirects</button>
<button class="tab" onclick="switchTab('seo')">SEO Analysis</button> <button class="tab" onclick="switchTab('seo')">SEO Analysis</button>
</div> </div>
@@ -303,6 +304,25 @@
</table> </table>
</div> </div>
<div class="tab-content" id="redirects-tab">
<h3>Redirect Statistics</h3>
<div id="redirectStats" class="stats" style="margin-bottom: 20px;"></div>
<table>
<thead>
<tr>
<th>URL</th>
<th>Redirect To</th>
<th>Status Code</th>
<th>Redirect Count</th>
<th>Type</th>
</tr>
</thead>
<tbody id="redirectsBody">
<tr><td colspan="5" class="loading">Keine Redirects gefunden</td></tr>
</tbody>
</table>
</div>
<div class="tab-content" id="seo-tab"> <div class="tab-content" id="seo-tab">
<h3>SEO Issues</h3> <h3>SEO Issues</h3>
<div id="seoStats" style="margin-bottom: 20px;"></div> <div id="seoStats" style="margin-bottom: 20px;"></div>
@@ -549,6 +569,56 @@
} }
} }
// Load redirects
const redirectsResponse = await fetch(`/api.php?action=redirects&job_id=${currentJobId}`);
const redirectsData = await redirectsResponse.json();
if (redirectsData.success) {
const stats = redirectsData.stats;
// Redirect Stats
document.getElementById('redirectStats').innerHTML = `
<div class="stat-box">
<div class="stat-label">Total Redirects</div>
<div class="stat-value">${stats.total}</div>
</div>
<div class="stat-box">
<div class="stat-label">Permanent (301/308)</div>
<div class="stat-value">${stats.permanent}</div>
</div>
<div class="stat-box">
<div class="stat-label">Temporary (302/303/307)</div>
<div class="stat-value">${stats.temporary}</div>
</div>
<div class="stat-box">
<div class="stat-label">Excessive (>${stats.threshold})</div>
<div class="stat-value" style="color: ${stats.excessive > 0 ? '#e74c3c' : '#27ae60'}">${stats.excessive}</div>
<div class="stat-sublabel">threshold: ${stats.threshold}</div>
</div>
`;
// Redirect Table
if (redirectsData.redirects.length > 0) {
document.getElementById('redirectsBody').innerHTML = redirectsData.redirects.map(redirect => {
const isExcessive = redirect.redirect_count > stats.threshold;
const isPermRedirect = redirect.status_code == 301 || redirect.status_code == 308;
const redirectType = isPermRedirect ? 'Permanent' : 'Temporary';
return `
<tr style="${isExcessive ? 'background-color: #fff3cd;' : ''}">
<td class="url-cell" title="${redirect.url}">${redirect.url}</td>
<td class="url-cell" title="${redirect.redirect_url || '-'}">${redirect.redirect_url || '-'}</td>
<td><span class="status ${isPermRedirect ? 'completed' : 'running'}">${redirect.status_code}</span></td>
<td><strong ${isExcessive ? 'style="color: #e74c3c;"' : ''}>${redirect.redirect_count}</strong></td>
<td>${redirectType}</td>
</tr>
`;
}).join('');
} else {
document.getElementById('redirectsBody').innerHTML = '<tr><td colspan="5" class="loading">Keine Redirects gefunden</td></tr>';
}
}
// Update jobs table // Update jobs table
loadJobs(); loadJobs();
} catch (e) { } catch (e) {