diff --git a/config/docker/init.sql b/config/docker/init.sql
index 02cf861..4f08552 100644
--- a/config/docker/init.sql
+++ b/config/docker/init.sql
@@ -31,11 +31,18 @@ CREATE TABLE IF NOT EXISTS pages (
     meta_description TEXT,
     status_code INT,
     content_type VARCHAR(100),
+    -- Final URL reached after following redirects; NULL when the page did not redirect.
+    redirect_url VARCHAR(2048),
+    -- Number of redirect hops followed to reach the final response.
+    redirect_count INT NOT NULL DEFAULT 0,
+    -- HTTP status of the first redirect hop (301, 302, ...); NULL when the page did not redirect.
+    redirect_status_code INT,
     crawled_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
     FOREIGN KEY (crawl_job_id) REFERENCES crawl_jobs(id) ON DELETE CASCADE,
     INDEX idx_crawl_job (crawl_job_id),
     INDEX idx_url (url(255)),
     INDEX idx_status_code (status_code),
+    INDEX idx_redirect_count (crawl_job_id, redirect_count),
     UNIQUE KEY unique_job_url (crawl_job_id, url(255))
 ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
 
diff --git a/src/api.php b/src/api.php
index b5e3b42..5407408 100644
--- a/src/api.php
+++ b/src/api.php
@@ -217,6 +217,53 @@ try {
             ]);
             break;
 
+        case 'redirects':
+            // List every page of a crawl job that was reached through at least one redirect.
+            $jobId = (int) ($_GET['job_id'] ?? 0);
+            // The crawler follows redirects, so the stored status_code belongs to the final
+            // response. The redirect type is therefore taken from redirect_status_code (first
+            // hop); rows where that column is NULL fall back to status_code.
+            $stmt = $db->prepare(
+                "SELECT url, title, COALESCE(redirect_status_code, status_code) AS status_code, " .
+                "status_code AS final_status_code, redirect_url, redirect_count FROM pages " .
+                "WHERE crawl_job_id = ? AND redirect_count > 0 " .
+                "ORDER BY redirect_count DESC, url"
+            );
+            $stmt->execute([$jobId]);
+            $redirects = $stmt->fetchAll();
+
+            // Classify each redirect and count chains longer than the threshold.
+            $permanent = 0;
+            $temporary = 0;
+            $excessive = 0;
+            $maxThreshold = 3; // Keep in sync with Config::MAX_REDIRECT_THRESHOLD
+
+            foreach ($redirects as $redirect) {
+                // The DB driver may hand numeric columns back as strings; compare as integers.
+                $code = (int) $redirect['status_code'];
+                if (in_array($code, [301, 308], true)) {
+                    $permanent++;
+                } elseif (in_array($code, [302, 303, 307], true)) {
+                    $temporary++;
+                }
+                if ((int) $redirect['redirect_count'] > $maxThreshold) {
+                    $excessive++;
+                }
+            }
+
+            echo json_encode([
+                'success' => true,
+                'redirects' => $redirects,
+                'stats' => [
+                    'total' => count($redirects),
+                    'permanent' => $permanent,
+                    'temporary' => $temporary,
+                    'excessive' => $excessive,
+                    'threshold' => $maxThreshold
+                ]
+            ]);
+            break;
+
         case 'delete':
             $jobId = $_POST['job_id'] ?? 0;
             $stmt = $db->prepare("DELETE FROM crawl_jobs WHERE id = ?");
diff --git a/src/classes/Config.php b/src/classes/Config.php
new file mode 100644
index 0000000..4a77325
--- /dev/null
+++ b/src/classes/Config.php
@@ -0,0 +1,29 @@
+<?php
+
+/**
+ * Central configuration constants for the crawler.
+ *
+ * Typed class constants are used below, which requires PHP 8.3 or newer.
+ *
+ * @link https://kies-media.de
+ */
+
+namespace App;
+
+class Config
+{
+    /**
+     * Maximum number of redirects before warning
+     */
+    public const int MAX_REDIRECT_THRESHOLD = 3;
+
+    /**
+     * Maximum crawl depth
+     */
+    public const int MAX_CRAWL_DEPTH = 50;
+
+    /**
+     * Number of parallel requests
+     */
+    public const int CONCURRENCY = 10;
+}
diff --git a/src/classes/Crawler.php b/src/classes/Crawler.php
index c14cdc6..5251ecf 100644
--- a/src/classes/Crawler.php
+++ b/src/classes/Crawler.php
@@ -33,6 +33,12 @@ class Crawler
         $this->client = new Client([
             'timeout' => 30,
             'verify' => false,
+            // Follow up to 10 redirects and have Guzzle record each hop's URL and status in the
+            // X-Guzzle-Redirect-History / X-Guzzle-Redirect-Status-History response headers.
+            'allow_redirects' => [
+                'max' => 10,
+                'track_redirects' => true
+            ],
             'headers' => [
                 'User-Agent' => 'WebCrawler/1.0'
             ]
@@ -144,6 +150,18 @@ class Crawler
         $contentType = $response->getHeaderLine('Content-Type');
         $body = $response->getBody()->getContents();
 
+        // Track redirects. With 'track_redirects' enabled Guzzle lists every URL that was
+        // redirected to and the status code of each hop, in the order encountered.
+        // getHeader() returns an empty array when the header is absent (no redirect).
+        $redirectHistory = $response->getHeader('X-Guzzle-Redirect-History');
+        $redirectStatusHistory = $response->getHeader('X-Guzzle-Redirect-Status-History');
+        $redirectCount = count($redirectHistory);
+        // Final destination of the chain, or null when the URL did not redirect.
+        $redirectUrl = $redirectCount > 0 ? end($redirectHistory) : null;
+        // Status answered by the crawled URL itself (first hop). $response is the final,
+        // post-redirect response, so its own status cannot tell a 301 from a 302.
+        $redirectStatusCode = $redirectStatusHistory ? (int) reset($redirectStatusHistory) : null;
+
         // Save page
         $domCrawler = new DomCrawler($body, $url);
         $title = $domCrawler->filter('title')->count() > 0
@@ -155,13 +173,26 @@ class Crawler
             : '';
 
         $stmt = $this->db->prepare(
-            "INSERT INTO pages (crawl_job_id, url, title, meta_description, status_code, content_type) " .
-            "VALUES (?, ?, ?, ?, ?, ?) " .
+            "INSERT INTO pages (crawl_job_id, url, title, meta_description, status_code, " .
+            "content_type, redirect_url, redirect_count, redirect_status_code) " .
+            "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) " .
             "ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id), status_code = VALUES(status_code), " .
-            "meta_description = VALUES(meta_description)"
+            "meta_description = VALUES(meta_description), redirect_url = VALUES(redirect_url), " .
+            "redirect_count = VALUES(redirect_count), " .
+            "redirect_status_code = VALUES(redirect_status_code)"
         );
 
-        $stmt->execute([$this->crawlJobId, $url, $title, $metaDescription, $statusCode, $contentType]);
+        $stmt->execute([
+            $this->crawlJobId,
+            $url,
+            $title,
+            $metaDescription,
+            $statusCode,
+            $contentType,
+            $redirectUrl,
+            $redirectCount,
+            $redirectStatusCode
+        ]);
         $pageId = $this->db->lastInsertId();
 
         // If pageId is 0, fetch it manually
diff --git a/src/index.php b/src/index.php
index b61f221..934a477 100644
--- a/src/index.php
+++ b/src/index.php
@@ -251,6 +251,7 @@
+
@@ -303,6 +304,25 @@
+
+

Redirect Statistics

+
+ + + + + + + + + + + + + +
URLRedirect ToStatus CodeRedirect CountType
Keine Redirects gefunden
+
+

SEO Issues

@@ -549,6 +569,56 @@ } } + // Load redirects + const redirectsResponse = await fetch(`/api.php?action=redirects&job_id=${currentJobId}`); + const redirectsData = await redirectsResponse.json(); + + if (redirectsData.success) { + const stats = redirectsData.stats; + + // Redirect Stats + document.getElementById('redirectStats').innerHTML = ` +
+
Total Redirects
+
${stats.total}
+
+
+
Permanent (301/308)
+
${stats.permanent}
+
+
+
Temporary (302/303/307)
+
${stats.temporary}
+
+
+
Excessive (>${stats.threshold})
+
${stats.excessive}
+
threshold: ${stats.threshold}
+
+ `; + + // Redirect Table + if (redirectsData.redirects.length > 0) { + document.getElementById('redirectsBody').innerHTML = redirectsData.redirects.map(redirect => { + const isExcessive = redirect.redirect_count > stats.threshold; + const isPermRedirect = redirect.status_code == 301 || redirect.status_code == 308; + const redirectType = isPermRedirect ? 'Permanent' : 'Temporary'; + + return ` + + ${redirect.url} + ${redirect.redirect_url || '-'} + ${redirect.status_code} + ${redirect.redirect_count} + ${redirectType} + + `; + }).join(''); + } else { + document.getElementById('redirectsBody').innerHTML = 'Keine Redirects gefunden'; + } + } + // Update jobs table loadJobs(); } catch (e) {