Add Broken Links detection and SEO Analysis features

Database Schema:
- Added meta_description TEXT field to pages table
- Added index on status_code for faster broken-link queries (a migration note for existing installs follows the schema below)

Backend Changes:
- Crawler now extracts meta descriptions from crawled pages
- New API endpoint `broken-links`: lists pages with status codes >= 400, plus failed requests recorded as status 0
- New API endpoint `seo-analysis`: validates titles and meta descriptions and reports duplicates (usage sketch below)
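Both endpoints return JSON with a `success` flag. A minimal consumption sketch, assuming the app is served on localhost and a crawl job with id 1 exists (both are placeholders, not part of this commit):

```php
<?php
// Hedged usage sketch for the new endpoints; host and job_id are
// illustrative assumptions.
$json = file_get_contents('http://localhost/api.php?action=broken-links&job_id=1');
$data = json_decode($json, true);

// Expected shape: ['success' => true, 'broken_links' => [ ...page rows... ]]
foreach ($data['broken_links'] as $page) {
    echo "{$page['status_code']}\t{$page['url']}\n";
}
```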

SEO Analysis Features:
- Title length validation (optimal: 30-60 chars; see the condensed check sketched after this list)
- Meta description length validation (optimal: 70-160 chars)
- Detection of missing titles/descriptions
- Duplicate content detection (titles and meta descriptions)
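The length rules compress into one helper; the following is a condensed restatement of the inline checks in the api.php diff below, not a separate implementation:

```php
<?php
// Condensed restatement of the length checks implemented inline in the
// seo-analysis endpoint; thresholds match the diff below.
function lengthIssue(string $label, ?string $value, int $min, int $max): ?string
{
    if ($value === null || $value === '') {
        return "{$label} missing";
    }
    $len = mb_strlen($value);
    if ($len < $min) {
        return "{$label} too short ({$len} chars)";
    }
    if ($len > $max) {
        return "{$label} too long ({$len} chars)";
    }
    return null;
}

// lengthIssue('Title', 'Home', 30, 60)           => "Title too short (4 chars)"
// lengthIssue('Meta description', null, 70, 160) => "Meta description missing"
```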

Frontend Changes:
- Added "Broken Links" tab showing pages with errors
- Added "SEO Analysis" tab with:
  * Statistics overview
  * Pages with SEO issues
  * Duplicate content report

All quality checks pass:
- PHPStan Level 8: 0 errors
- PHPCS PSR-12: 0 warnings

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

@@ -28,12 +28,14 @@ CREATE TABLE IF NOT EXISTS pages (
crawl_job_id INT NOT NULL,
url VARCHAR(2048) NOT NULL,
title VARCHAR(500),
meta_description TEXT,
status_code INT,
content_type VARCHAR(100),
crawled_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (crawl_job_id) REFERENCES crawl_jobs(id) ON DELETE CASCADE,
INDEX idx_crawl_job (crawl_job_id),
INDEX idx_url (url(255)),
INDEX idx_status_code (status_code),
UNIQUE KEY unique_job_url (crawl_job_id, url(255))
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
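
Note that `CREATE TABLE IF NOT EXISTS` only shapes fresh installs; an existing `pages` table keeps its old definition. A hedged migration sketch for that case, assuming a PDO handle `$db` and that neither the column nor the index exists yet:

```php
<?php
// One-off migration sketch for existing installs (assumption: column and
// index are not present yet; both statements fail otherwise).
$db->exec("ALTER TABLE pages ADD COLUMN meta_description TEXT AFTER title");
$db->exec("ALTER TABLE pages ADD INDEX idx_status_code (status_code)");
```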

@@ -116,6 +116,107 @@ try {
]);
break;
case 'broken-links':
$jobId = (int) ($_GET['job_id'] ?? 0);
$stmt = $db->prepare(
"SELECT * FROM pages " .
"WHERE crawl_job_id = ? AND (status_code >= 400 OR status_code = 0) " .
"ORDER BY status_code DESC, url"
);
$stmt->execute([$jobId]);
$brokenLinks = $stmt->fetchAll();
echo json_encode([
'success' => true,
'broken_links' => $brokenLinks
]);
break;
case 'seo-analysis':
$jobId = (int) ($_GET['job_id'] ?? 0);
$stmt = $db->prepare(
"SELECT id, url, title, meta_description, status_code FROM pages " .
"WHERE crawl_job_id = ? ORDER BY url"
);
$stmt->execute([$jobId]);
$pages = $stmt->fetchAll();
$issues = [];
foreach ($pages as $page) {
$pageIssues = [];
$titleLen = mb_strlen($page['title'] ?? '');
$descLen = mb_strlen($page['meta_description'] ?? '');
// Title issues (accepted range: 30-60 chars; Google typically displays ~50-60)
if (empty($page['title'])) {
$pageIssues[] = 'Title missing';
} elseif ($titleLen < 30) {
$pageIssues[] = "Title too short ({$titleLen} chars)";
} elseif ($titleLen > 60) {
$pageIssues[] = "Title too long ({$titleLen} chars)";
}
// Meta description issues (accepted range: 70-160 chars)
if (empty($page['meta_description'])) {
$pageIssues[] = 'Meta description missing';
} elseif ($descLen < 70) {
$pageIssues[] = "Meta description too short ({$descLen} chars)";
} elseif ($descLen > 160) {
$pageIssues[] = "Meta description too long ({$descLen} chars)";
}
if (!empty($pageIssues)) {
$issues[] = [
'url' => $page['url'],
'title' => $page['title'],
'title_length' => $titleLen,
'meta_description' => $page['meta_description'],
'meta_length' => $descLen,
'issues' => $pageIssues
];
}
}
// Find duplicates
$titleCounts = [];
$descCounts = [];
foreach ($pages as $page) {
if (!empty($page['title'])) {
$titleCounts[$page['title']][] = $page['url'];
}
if (!empty($page['meta_description'])) {
$descCounts[$page['meta_description']][] = $page['url'];
}
}
$duplicates = [];
foreach ($titleCounts as $title => $urls) {
if (count($urls) > 1) {
$duplicates[] = [
'type' => 'title',
'content' => $title,
'urls' => $urls
];
}
}
foreach ($descCounts as $desc => $urls) {
if (count($urls) > 1) {
$duplicates[] = [
'type' => 'meta_description',
'content' => $desc,
'urls' => $urls
];
}
}
echo json_encode([
'success' => true,
'issues' => $issues,
'duplicates' => $duplicates,
'total_pages' => count($pages)
]);
break;
case 'delete':
$jobId = $_POST['job_id'] ?? 0;
$stmt = $db->prepare("DELETE FROM crawl_jobs WHERE id = ?");

@@ -150,13 +150,18 @@ class Crawler
? $domCrawler->filter('title')->text()
: '';
$metaDescription = $domCrawler->filter('meta[name="description"]')->count() > 0
? ($domCrawler->filter('meta[name="description"]')->attr('content') ?? '')
: '';
$stmt = $this->db->prepare(
-    "INSERT INTO pages (crawl_job_id, url, title, status_code, content_type)
-     VALUES (?, ?, ?, ?, ?)
-     ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id), status_code = VALUES(status_code)"
+    "INSERT INTO pages (crawl_job_id, url, title, meta_description, status_code, content_type) " .
+    "VALUES (?, ?, ?, ?, ?, ?) " .
+    "ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id), status_code = VALUES(status_code), " .
+    "meta_description = VALUES(meta_description)"
);
-$stmt->execute([$this->crawlJobId, $url, $title, $statusCode, $contentType]);
+$stmt->execute([$this->crawlJobId, $url, $title, $metaDescription, $statusCode, $contentType]);
$pageId = $this->db->lastInsertId();
// If pageId is 0, fetch it manually
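
The hunk is cut off before the fallback that the last comment refers to. A plausible sketch of that manual lookup, labeled as an assumption since the actual lines are not part of this diff:

```php
// Assumed shape of the truncated fallback: if lastInsertId() returned 0,
// resolve the row id via the unique (crawl_job_id, url) key.
if ((int) $pageId === 0) {
    $stmt = $this->db->prepare("SELECT id FROM pages WHERE crawl_job_id = ? AND url = ?");
    $stmt->execute([$this->crawlJobId, $url]);
    $pageId = (int) $stmt->fetchColumn();
}
```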

@@ -250,6 +250,8 @@
<div class="tabs">
<button class="tab active" onclick="switchTab('pages')">Seiten</button>
<button class="tab" onclick="switchTab('links')">Links</button>
<button class="tab" onclick="switchTab('broken')">Broken Links</button>
<button class="tab" onclick="switchTab('seo')">SEO Analysis</button>
</div>
<div class="tab-content active" id="pages-tab">
@@ -284,6 +286,43 @@
</tbody>
</table>
</div>
<div class="tab-content" id="broken-tab">
<table>
<thead>
<tr>
<th>URL</th>
<th>Status Code</th>
<th>Titel</th>
<th>Gecrawlt</th>
</tr>
</thead>
<tbody id="brokenBody">
<tr><td colspan="4" class="loading">Keine defekten Links gefunden</td></tr>
</tbody>
</table>
</div>
<div class="tab-content" id="seo-tab">
<h3>SEO Issues</h3>
<div id="seoStats" style="margin-bottom: 20px;"></div>
<table>
<thead>
<tr>
<th>URL</th>
<th>Title (Länge)</th>
<th>Meta Description (Länge)</th>
<th>Issues</th>
</tr>
</thead>
<tbody id="seoIssuesBody">
<tr><td colspan="4" class="loading">Keine SEO-Probleme gefunden</td></tr>
</tbody>
</table>
<h3 style="margin-top: 30px;">Duplicate Content</h3>
<div id="seoDuplicatesBody"></div>
</div>
</div>
</div>
</div>
@@ -441,6 +480,75 @@
`).join('');
}
// Load broken links
const brokenResponse = await fetch(`/api.php?action=broken-links&job_id=${currentJobId}`);
const brokenData = await brokenResponse.json();
if (brokenData.success && brokenData.broken_links.length > 0) {
document.getElementById('brokenBody').innerHTML = brokenData.broken_links.map(page => `
<tr>
<td class="url-cell" title="${page.url}">${page.url}</td>
<td><span class="status failed">${page.status_code || 'Error'}</span></td>
<td>${page.title || '-'}</td>
<td>${page.crawled_at}</td>
</tr>
`).join('');
} else {
document.getElementById('brokenBody').innerHTML = '<tr><td colspan="4" class="loading">Keine defekten Links gefunden</td></tr>';
}
// Load SEO analysis
const seoResponse = await fetch(`/api.php?action=seo-analysis&job_id=${currentJobId}`);
const seoData = await seoResponse.json();
if (seoData.success) {
// SEO Stats
document.getElementById('seoStats').innerHTML = `
<div class="stat-box">
<div class="stat-label">Total Pages</div>
<div class="stat-value">${seoData.total_pages}</div>
</div>
<div class="stat-box">
<div class="stat-label">Pages with Issues</div>
<div class="stat-value">${seoData.issues.length}</div>
</div>
<div class="stat-box">
<div class="stat-label">Duplicates Found</div>
<div class="stat-value">${seoData.duplicates.length}</div>
</div>
`;
// SEO Issues
if (seoData.issues.length > 0) {
document.getElementById('seoIssuesBody').innerHTML = seoData.issues.map(item => `
<tr>
<td class="url-cell" title="${item.url}">${item.url}</td>
<td>${item.title || '-'} (${item.title_length})</td>
<td>${item.meta_description ? item.meta_description.substring(0, 50) + '...' : '-'} (${item.meta_length})</td>
<td><span class="nofollow">${item.issues.join(', ')}</span></td>
</tr>
`).join('');
} else {
document.getElementById('seoIssuesBody').innerHTML = '<tr><td colspan="4" class="loading">Keine SEO-Probleme gefunden</td></tr>';
}
// Duplicates
if (seoData.duplicates.length > 0) {
document.getElementById('seoDuplicatesBody').innerHTML = seoData.duplicates.map(dup => `
<div class="stat-box" style="margin-bottom: 15px;">
<div class="stat-label">Duplicate ${dup.type}</div>
<div style="font-size: 14px; margin: 10px 0;"><strong>${dup.content}</strong></div>
<div style="font-size: 12px;">Found on ${dup.urls.length} pages:</div>
<ul style="margin-top: 5px; font-size: 12px;">
${dup.urls.map(url => `<li>${url}</li>`).join('')}
</ul>
</div>
`).join('');
} else {
document.getElementById('seoDuplicatesBody').innerHTML = '<p>Keine doppelten Inhalte gefunden</p>';
}
}
// Update jobs table
loadJobs();
} catch (e) {