Add Broken Links detection and SEO Analysis features
Database Schema:
- Added meta_description TEXT field to pages table
- Added index on status_code for faster broken link queries

Backend Changes:
- Crawler now extracts meta descriptions from pages
- New API endpoint: broken-links (finds 404s and server errors)
- New API endpoint: seo-analysis (analyzes titles and meta descriptions)

SEO Analysis Features:
- Title length validation (optimal: 30-60 chars)
- Meta description length validation (optimal: 70-160 chars)
- Detection of missing titles/descriptions
- Duplicate content detection (titles and meta descriptions)

Frontend Changes:
- Added "Broken Links" tab showing pages with errors
- Added "SEO Analysis" tab with:
  * Statistics overview
  * Pages with SEO issues
  * Duplicate content report

All quality checks pass:
- PHPStan Level 8: 0 errors
- PHPCS PSR-12: 0 warnings

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -28,12 +28,14 @@ CREATE TABLE IF NOT EXISTS pages (
|
||||
crawl_job_id INT NOT NULL,
|
||||
url VARCHAR(2048) NOT NULL,
|
||||
title VARCHAR(500),
|
||||
meta_description TEXT,
|
||||
status_code INT,
|
||||
content_type VARCHAR(100),
|
||||
crawled_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
FOREIGN KEY (crawl_job_id) REFERENCES crawl_jobs(id) ON DELETE CASCADE,
|
||||
INDEX idx_crawl_job (crawl_job_id),
|
||||
INDEX idx_url (url(255)),
|
||||
INDEX idx_status_code (status_code),
|
||||
UNIQUE KEY unique_job_url (crawl_job_id, url(255))
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||
|
||||
|
||||
101
src/api.php
101
src/api.php
@@ -116,6 +116,107 @@ try {
|
||||
]);
|
||||
break;
|
||||
|
||||
case 'broken-links':
|
||||
$jobId = $_GET['job_id'] ?? 0;
|
||||
$stmt = $db->prepare(
|
||||
"SELECT * FROM pages " .
|
||||
"WHERE crawl_job_id = ? AND (status_code >= 400 OR status_code = 0) " .
|
||||
"ORDER BY status_code DESC, url"
|
||||
);
|
||||
$stmt->execute([$jobId]);
|
||||
$brokenLinks = $stmt->fetchAll();
|
||||
|
||||
echo json_encode([
|
||||
'success' => true,
|
||||
'broken_links' => $brokenLinks
|
||||
]);
|
||||
break;
|
||||
|
||||
case 'seo-analysis':
|
||||
$jobId = $_GET['job_id'] ?? 0;
|
||||
$stmt = $db->prepare(
|
||||
"SELECT id, url, title, meta_description, status_code FROM pages " .
|
||||
"WHERE crawl_job_id = ? ORDER BY url"
|
||||
);
|
||||
$stmt->execute([$jobId]);
|
||||
$pages = $stmt->fetchAll();
|
||||
|
||||
$issues = [];
|
||||
foreach ($pages as $page) {
|
||||
$pageIssues = [];
|
||||
$titleLen = mb_strlen($page['title'] ?? '');
|
||||
$descLen = mb_strlen($page['meta_description'] ?? '');
|
||||
|
||||
// Title issues (Google: 50-60 chars optimal)
|
||||
if (empty($page['title'])) {
|
||||
$pageIssues[] = 'Title missing';
|
||||
} elseif ($titleLen < 30) {
|
||||
$pageIssues[] = "Title too short ({$titleLen} chars)";
|
||||
} elseif ($titleLen > 60) {
|
||||
$pageIssues[] = "Title too long ({$titleLen} chars)";
|
||||
}
|
||||
|
||||
// Meta description issues (Google: 120-160 chars optimal)
|
||||
if (empty($page['meta_description'])) {
|
||||
$pageIssues[] = 'Meta description missing';
|
||||
} elseif ($descLen < 70) {
|
||||
$pageIssues[] = "Meta description too short ({$descLen} chars)";
|
||||
} elseif ($descLen > 160) {
|
||||
$pageIssues[] = "Meta description too long ({$descLen} chars)";
|
||||
}
|
||||
|
||||
if (!empty($pageIssues)) {
|
||||
$issues[] = [
|
||||
'url' => $page['url'],
|
||||
'title' => $page['title'],
|
||||
'title_length' => $titleLen,
|
||||
'meta_description' => $page['meta_description'],
|
||||
'meta_length' => $descLen,
|
||||
'issues' => $pageIssues
|
||||
];
|
||||
}
|
||||
}
|
||||
|
||||
// Find duplicates
|
||||
$titleCounts = [];
|
||||
$descCounts = [];
|
||||
foreach ($pages as $page) {
|
||||
if (!empty($page['title'])) {
|
||||
$titleCounts[$page['title']][] = $page['url'];
|
||||
}
|
||||
if (!empty($page['meta_description'])) {
|
||||
$descCounts[$page['meta_description']][] = $page['url'];
|
||||
}
|
||||
}
|
||||
|
||||
$duplicates = [];
|
||||
foreach ($titleCounts as $title => $urls) {
|
||||
if (count($urls) > 1) {
|
||||
$duplicates[] = [
|
||||
'type' => 'title',
|
||||
'content' => $title,
|
||||
'urls' => $urls
|
||||
];
|
||||
}
|
||||
}
|
||||
foreach ($descCounts as $desc => $urls) {
|
||||
if (count($urls) > 1) {
|
||||
$duplicates[] = [
|
||||
'type' => 'meta_description',
|
||||
'content' => $desc,
|
||||
'urls' => $urls
|
||||
];
|
||||
}
|
||||
}
|
||||
|
||||
echo json_encode([
|
||||
'success' => true,
|
||||
'issues' => $issues,
|
||||
'duplicates' => $duplicates,
|
||||
'total_pages' => count($pages)
|
||||
]);
|
||||
break;
|
||||
|
||||
case 'delete':
|
||||
$jobId = $_POST['job_id'] ?? 0;
|
||||
$stmt = $db->prepare("DELETE FROM crawl_jobs WHERE id = ?");
|
||||
|
||||
@@ -150,13 +150,18 @@ class Crawler
|
||||
? $domCrawler->filter('title')->text()
|
||||
: '';
|
||||
|
||||
$metaDescription = $domCrawler->filter('meta[name="description"]')->count() > 0
|
||||
? $domCrawler->filter('meta[name="description"]')->attr('content')
|
||||
: '';
|
||||
|
||||
$stmt = $this->db->prepare(
|
||||
"INSERT INTO pages (crawl_job_id, url, title, status_code, content_type)
|
||||
VALUES (?, ?, ?, ?, ?)
|
||||
ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id), status_code = VALUES(status_code)"
|
||||
"INSERT INTO pages (crawl_job_id, url, title, meta_description, status_code, content_type) " .
|
||||
"VALUES (?, ?, ?, ?, ?, ?) " .
|
||||
"ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id), status_code = VALUES(status_code), " .
|
||||
"meta_description = VALUES(meta_description)"
|
||||
);
|
||||
|
||||
$stmt->execute([$this->crawlJobId, $url, $title, $statusCode, $contentType]);
|
||||
$stmt->execute([$this->crawlJobId, $url, $title, $metaDescription, $statusCode, $contentType]);
|
||||
$pageId = $this->db->lastInsertId();
|
||||
|
||||
// If pageId is 0, fetch it manually
|
||||
|
||||
108
src/index.php
108
src/index.php
@@ -250,6 +250,8 @@
|
||||
<div class="tabs">
|
||||
<button class="tab active" onclick="switchTab('pages')">Seiten</button>
|
||||
<button class="tab" onclick="switchTab('links')">Links</button>
|
||||
<button class="tab" onclick="switchTab('broken')">Broken Links</button>
|
||||
<button class="tab" onclick="switchTab('seo')">SEO Analysis</button>
|
||||
</div>
|
||||
|
||||
<div class="tab-content active" id="pages-tab">
|
||||
@@ -284,6 +286,43 @@
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
<div class="tab-content" id="broken-tab">
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>URL</th>
|
||||
<th>Status Code</th>
|
||||
<th>Titel</th>
|
||||
<th>Gecrawlt</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="brokenBody">
|
||||
<tr><td colspan="4" class="loading">Keine defekten Links gefunden</td></tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
<div class="tab-content" id="seo-tab">
|
||||
<h3>SEO Issues</h3>
|
||||
<div id="seoStats" style="margin-bottom: 20px;"></div>
|
||||
<table>
|
||||
<thead>
|
||||
<tr>
|
||||
<th>URL</th>
|
||||
<th>Title (Länge)</th>
|
||||
<th>Meta Description (Länge)</th>
|
||||
<th>Issues</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="seoIssuesBody">
|
||||
<tr><td colspan="4" class="loading">Keine SEO-Probleme gefunden</td></tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<h3 style="margin-top: 30px;">Duplicate Content</h3>
|
||||
<div id="seoDuplicatesBody"></div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
@@ -441,6 +480,75 @@
|
||||
`).join('');
|
||||
}
|
||||
|
||||
// Load broken links
|
||||
const brokenResponse = await fetch(`/api.php?action=broken-links&job_id=${currentJobId}`);
|
||||
const brokenData = await brokenResponse.json();
|
||||
|
||||
if (brokenData.success && brokenData.broken_links.length > 0) {
|
||||
document.getElementById('brokenBody').innerHTML = brokenData.broken_links.map(page => `
|
||||
<tr>
|
||||
<td class="url-cell" title="${page.url}">${page.url}</td>
|
||||
<td><span class="status failed">${page.status_code || 'Error'}</span></td>
|
||||
<td>${page.title || '-'}</td>
|
||||
<td>${page.crawled_at}</td>
|
||||
</tr>
|
||||
`).join('');
|
||||
} else {
|
||||
document.getElementById('brokenBody').innerHTML = '<tr><td colspan="4" class="loading">Keine defekten Links gefunden</td></tr>';
|
||||
}
|
||||
|
||||
// Load SEO analysis
|
||||
const seoResponse = await fetch(`/api.php?action=seo-analysis&job_id=${currentJobId}`);
|
||||
const seoData = await seoResponse.json();
|
||||
|
||||
if (seoData.success) {
|
||||
// SEO Stats
|
||||
document.getElementById('seoStats').innerHTML = `
|
||||
<div class="stat-box">
|
||||
<div class="stat-label">Total Pages</div>
|
||||
<div class="stat-value">${seoData.total_pages}</div>
|
||||
</div>
|
||||
<div class="stat-box">
|
||||
<div class="stat-label">Pages with Issues</div>
|
||||
<div class="stat-value">${seoData.issues.length}</div>
|
||||
</div>
|
||||
<div class="stat-box">
|
||||
<div class="stat-label">Duplicates Found</div>
|
||||
<div class="stat-value">${seoData.duplicates.length}</div>
|
||||
</div>
|
||||
`;
|
||||
|
||||
// SEO Issues
|
||||
if (seoData.issues.length > 0) {
|
||||
document.getElementById('seoIssuesBody').innerHTML = seoData.issues.map(item => `
|
||||
<tr>
|
||||
<td class="url-cell" title="${item.url}">${item.url}</td>
|
||||
<td>${item.title || '-'} (${item.title_length})</td>
|
||||
<td>${item.meta_description ? item.meta_description.substring(0, 50) + '...' : '-'} (${item.meta_length})</td>
|
||||
<td><span class="nofollow">${item.issues.join(', ')}</span></td>
|
||||
</tr>
|
||||
`).join('');
|
||||
} else {
|
||||
document.getElementById('seoIssuesBody').innerHTML = '<tr><td colspan="4" class="loading">Keine SEO-Probleme gefunden</td></tr>';
|
||||
}
|
||||
|
||||
// Duplicates
|
||||
if (seoData.duplicates.length > 0) {
|
||||
document.getElementById('seoDuplicatesBody').innerHTML = seoData.duplicates.map(dup => `
|
||||
<div class="stat-box" style="margin-bottom: 15px;">
|
||||
<div class="stat-label">Duplicate ${dup.type}</div>
|
||||
<div style="font-size: 14px; margin: 10px 0;"><strong>${dup.content}</strong></div>
|
||||
<div style="font-size: 12px;">Found on ${dup.urls.length} pages:</div>
|
||||
<ul style="margin-top: 5px; font-size: 12px;">
|
||||
${dup.urls.map(url => `<li>${url}</li>`).join('')}
|
||||
</ul>
|
||||
</div>
|
||||
`).join('');
|
||||
} else {
|
||||
document.getElementById('seoDuplicatesBody').innerHTML = '<p>Keine doppelten Inhalte gefunden</p>';
|
||||
}
|
||||
}
|
||||
|
||||
// Update jobs table
|
||||
loadJobs();
|
||||
} catch (e) {
|
||||
|
||||
Reference in New Issue
Block a user