diff --git a/config/docker/init.sql b/config/docker/init.sql
index 02cf861..4f08552 100644
--- a/config/docker/init.sql
+++ b/config/docker/init.sql
@@ -31,11 +31,14 @@ CREATE TABLE IF NOT EXISTS pages (
     meta_description TEXT,
     status_code INT,
     content_type VARCHAR(100),
+    redirect_url VARCHAR(2048),
+    redirect_count INT DEFAULT 0,
     crawled_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
     FOREIGN KEY (crawl_job_id) REFERENCES crawl_jobs(id) ON DELETE CASCADE,
     INDEX idx_crawl_job (crawl_job_id),
     INDEX idx_url (url(255)),
     INDEX idx_status_code (status_code),
+    INDEX idx_redirect_count (redirect_count),
     UNIQUE KEY unique_job_url (crawl_job_id, url(255))
 ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
 
diff --git a/src/api.php b/src/api.php
index b5e3b42..5407408 100644
--- a/src/api.php
+++ b/src/api.php
@@ -217,6 +217,57 @@ try {
             ]);
             break;
 
+        case 'redirects':
+            $jobId = $_GET['job_id'] ?? 0;
+            $stmt = $db->prepare(
+                "SELECT url, title, status_code, redirect_url, redirect_count FROM pages " .
+                "WHERE crawl_job_id = ? AND redirect_count > 0 " .
+                "ORDER BY redirect_count DESC, url"
+            );
+            $stmt->execute([$jobId]);
+            $redirects = $stmt->fetchAll();
+
+            // Read the threshold from the shared constant so this endpoint cannot drift from
+            // the crawler configuration; the literal is only a fallback for when App\Config
+            // cannot be loaded in this script.
+            $maxThreshold = class_exists(\App\Config::class)
+                ? \App\Config::MAX_REDIRECT_THRESHOLD
+                : 3;
+
+            // Count redirect types
+            // NOTE(review): the crawler's HTTP client follows redirects, so status_code is
+            // presumably the final response status rather than a 3xx code; if so, the
+            // permanent/temporary counters never increase - confirm against Crawler.
+            $permanent = 0;
+            $temporary = 0;
+            $excessive = 0;
+
+            foreach ($redirects as $redirect) {
+                // Cast before comparing: the database driver may hand back numeric strings.
+                $code = (int) $redirect['status_code'];
+                if (in_array($code, [301, 308], true)) {
+                    $permanent++;
+                } elseif (in_array($code, [302, 303, 307], true)) {
+                    $temporary++;
+                }
+                if ((int) $redirect['redirect_count'] > $maxThreshold) {
+                    $excessive++;
+                }
+            }
+
+            echo json_encode([
+                'success' => true,
+                'redirects' => $redirects,
+                'stats' => [
+                    'total' => count($redirects),
+                    'permanent' => $permanent,
+                    'temporary' => $temporary,
+                    'excessive' => $excessive,
+                    'threshold' => $maxThreshold
+                ]
+            ]);
+            break;
+
         case 'delete':
             $jobId = $_POST['job_id'] ?? 0;
             $stmt = $db->prepare("DELETE FROM crawl_jobs WHERE id = ?");
diff --git a/src/classes/Config.php b/src/classes/Config.php
new file mode 100644
index 0000000..4a77325
--- /dev/null
+++ b/src/classes/Config.php
@@ -0,0 +1,27 @@
+<?php
+
+/**
+ * Shared configuration constants (redirect threshold, crawl depth, concurrency).
+ *
+ * @link https://kies-media.de
+ */
+
+namespace App;
+
+class Config
+{
+    /**
+     * Maximum number of redirects before warning
+     */
+    public const int MAX_REDIRECT_THRESHOLD = 3;
+
+    /**
+     * Maximum crawl depth
+     */
+    public const int MAX_CRAWL_DEPTH = 50;
+
+    /**
+     * Number of parallel requests
+     */
+    public const int CONCURRENCY = 10;
+}
diff --git a/src/classes/Crawler.php b/src/classes/Crawler.php
index c14cdc6..5251ecf 100644
--- a/src/classes/Crawler.php
+++ b/src/classes/Crawler.php
@@ -33,6 +33,10 @@ class Crawler
         $this->client = new Client([
             'timeout' => 30,
             'verify' => false,
+            'allow_redirects' => [
+                'max' => 10,
+                'track_redirects' => true
+            ],
             'headers' => [
                 'User-Agent' => 'WebCrawler/1.0'
             ]
@@ -144,6 +148,12 @@ class Crawler
         $contentType = $response->getHeaderLine('Content-Type');
         $body = $response->getBody()->getContents();
 
+        // Track redirects: with track_redirects enabled, Guzzle lists every followed hop in
+        // this header; getHeader() returns an empty array when the header is absent.
+        $redirectHistory = $response->getHeader('X-Guzzle-Redirect-History');
+        $redirectCount = count($redirectHistory);
+        $redirectUrl = $redirectCount > 0 ? end($redirectHistory) : null;
+
         // Save page
         $domCrawler = new DomCrawler($body, $url);
         $title = $domCrawler->filter('title')->count() > 0
@@ -155,13 +165,24 @@ class Crawler
             : '';
 
         $stmt = $this->db->prepare(
-            "INSERT INTO pages (crawl_job_id, url, title, meta_description, status_code, content_type) " .
-            "VALUES (?, ?, ?, ?, ?, ?) " .
+            "INSERT INTO pages (crawl_job_id, url, title, meta_description, status_code, " .
+            "content_type, redirect_url, redirect_count) " .
+            "VALUES (?, ?, ?, ?, ?, ?, ?, ?) " .
             "ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id), status_code = VALUES(status_code), " .
-            "meta_description = VALUES(meta_description)"
+            "meta_description = VALUES(meta_description), redirect_url = VALUES(redirect_url), " .
+            "redirect_count = VALUES(redirect_count)"
         );
 
-        $stmt->execute([$this->crawlJobId, $url, $title, $metaDescription, $statusCode, $contentType]);
+        $stmt->execute([
+            $this->crawlJobId,
+            $url,
+            $title,
+            $metaDescription,
+            $statusCode,
+            $contentType,
+            $redirectUrl,
+            $redirectCount
+        ]);
         $pageId = $this->db->lastInsertId();
 
         // If pageId is 0, fetch it manually
diff --git a/src/index.php b/src/index.php
index b61f221..934a477 100644
--- a/src/index.php
+++ b/src/index.php
@@ -251,6 +251,7 @@
+
@@ -303,6 +304,25 @@
+
| URL | +Redirect To | +Status Code | +Redirect Count | +Type | +
|---|---|---|---|---|
| Keine Redirects gefunden | ||||