Initial push by Martin

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-03 13:55:43 +02:00
commit 2f301cec42
16 changed files with 2280 additions and 0 deletions

src/classes/Crawler.php (normal file, 286 lines added)

@@ -0,0 +1,286 @@
<?php
namespace App;
use GuzzleHttp\Client;
use GuzzleHttp\Pool;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Exception\RequestException;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
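/**
 * Queue-based web crawler: URLs to visit are stored in the crawl_queue table,
 * fetched in parallel batches with Guzzle, and the discovered pages and links
 * are persisted to the pages and links tables for the given crawl job.
 */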
class Crawler {
private \PDO $db;
private Client $client;
private int $concurrency = 10; // Parallel requests
private array $visited = [];
private int $crawlJobId;
private string $baseDomain;
public function __construct(int $crawlJobId) {
$this->db = Database::getInstance();
$this->crawlJobId = $crawlJobId;
$this->client = new Client([
'timeout' => 30,
'verify' => false, // skip TLS certificate verification (accepts self-signed certs)
'headers' => [
'User-Agent' => 'WebCrawler/1.0'
]
]);
}
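/**
 * Run a crawl: mark the job as running, seed the queue with the normalized
 * start URL, drain the queue, then store the stats and mark the job completed.
 */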
public function start(string $startUrl): void {
$this->baseDomain = strtolower(parse_url($startUrl, PHP_URL_HOST));
// Update job status
$stmt = $this->db->prepare("UPDATE crawl_jobs SET status = 'running', started_at = NOW() WHERE id = ?");
$stmt->execute([$this->crawlJobId]);
// Normalize and add start URL to queue
$normalizedStartUrl = $this->normalizeUrl($startUrl);
$this->addToQueue($normalizedStartUrl, 0);
// Process queue
$this->processQueue();
// Update job status
$this->updateJobStats();
$stmt = $this->db->prepare("UPDATE crawl_jobs SET status = 'completed', completed_at = NOW() WHERE id = ?");
$stmt->execute([$this->crawlJobId]);
}
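/** Enqueue a URL for this job unless it was already visited; table duplicates are skipped via INSERT IGNORE. */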
private function addToQueue(string $url, int $depth): void {
if (isset($this->visited[$url])) {
return;
}
try {
$stmt = $this->db->prepare(
"INSERT IGNORE INTO crawl_queue (crawl_job_id, url, depth) VALUES (?, ?, ?)"
);
$stmt->execute([$this->crawlJobId, $url, $depth]);
} catch (\Exception $e) {
// URL already in queue
}
}
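/** Fetch up to $concurrency pending queue rows at a time and crawl them until no pending rows remain. */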
private function processQueue(): void {
while (true) {
// Get pending URLs
$stmt = $this->db->prepare(
"SELECT id, url, depth FROM crawl_queue
WHERE crawl_job_id = ? AND status = 'pending'
LIMIT ?"
);
// Bind the LIMIT value as an integer so the query also works when prepare emulation is disabled
$stmt->bindValue(1, $this->crawlJobId, \PDO::PARAM_INT);
$stmt->bindValue(2, $this->concurrency, \PDO::PARAM_INT);
$stmt->execute();
$urls = $stmt->fetchAll();
if (empty($urls)) {
break;
}
$this->crawlBatch($urls);
}
}
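/** Mark the batch rows as processing and fetch their URLs concurrently through a Guzzle request pool. */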
private function crawlBatch(array $urls): void {
$requests = function() use ($urls) {
foreach ($urls as $item) {
// Mark as processing
$stmt = $this->db->prepare("UPDATE crawl_queue SET status = 'processing' WHERE id = ?");
$stmt->execute([$item['id']]);
yield function() use ($item) {
return $this->client->getAsync($item['url']);
};
}
};
$pool = new Pool($this->client, $requests(), [
'concurrency' => $this->concurrency,
'fulfilled' => function ($response, $index) use ($urls) {
$item = $urls[$index];
$this->handleResponse($item, $response);
},
'rejected' => function ($reason, $index) use ($urls) {
$item = $urls[$index];
$this->handleError($item, $reason);
},
]);
$pool->promise()->wait();
}
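/** Persist the fetched page (upserting on duplicates), extract links from HTML responses, and mark the queue row completed. */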
private function handleResponse(array $queueItem, $response): void {
$url = $queueItem['url'];
$depth = $queueItem['depth'];
$this->visited[$url] = true;
$statusCode = $response->getStatusCode();
$contentType = $response->getHeaderLine('Content-Type');
$body = $response->getBody()->getContents();
// Save page
$domCrawler = new DomCrawler($body, $url);
$title = $domCrawler->filter('title')->count() > 0
? $domCrawler->filter('title')->text()
: '';
$stmt = $this->db->prepare(
"INSERT INTO pages (crawl_job_id, url, title, status_code, content_type)
VALUES (?, ?, ?, ?, ?)
ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id), status_code = VALUES(status_code)"
);
$stmt->execute([$this->crawlJobId, $url, $title, $statusCode, $contentType]);
$pageId = $this->db->lastInsertId();
// If pageId is 0, fetch it manually
if ($pageId == 0) {
$stmt = $this->db->prepare("SELECT id FROM pages WHERE crawl_job_id = ? AND url = ?");
$stmt->execute([$this->crawlJobId, $url]);
$pageId = $stmt->fetchColumn();
}
// Extract and save links
if (str_contains($contentType, 'text/html')) {
echo "Extracting links from: $url (pageId: $pageId)\n";
$this->extractLinks($domCrawler, $url, $pageId, $depth);
} else {
echo "Skipping link extraction - content type: $contentType\n";
}
// Mark as completed
$stmt = $this->db->prepare("UPDATE crawl_queue SET status = 'completed', processed_at = NOW() WHERE id = ?");
$stmt->execute([$queueItem['id']]);
}
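/** Record every <a> link found on the page and enqueue internal, followable targets up to a depth of 50. */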
private function extractLinks(DomCrawler $crawler, string $sourceUrl, int $pageId, int $depth): void {
$linkCount = 0;
$crawler->filter('a')->each(function (DomCrawler $node) use ($sourceUrl, $pageId, $depth, &$linkCount) {
try {
$linkCount++;
$href = $node->attr('href');
if (!$href || $href === '#') {
return;
}
// Convert relative URLs to absolute
$targetUrl = $this->makeAbsoluteUrl($href, $sourceUrl);
// Get link text
$linkText = trim($node->text());
// Check nofollow
$rel = $node->attr('rel') ?? '';
$isNofollow = str_contains($rel, 'nofollow');
// Check if internal (same domain; www and non-www count as the same site,
// consistent with the www handling in normalizeUrl())
$targetDomain = strtolower((string) parse_url($targetUrl, PHP_URL_HOST));
$bareTarget = preg_replace('/^www\./', '', $targetDomain);
$bareBase = preg_replace('/^www\./', '', $this->baseDomain);
$isInternal = ($bareTarget !== '' && $bareTarget === $bareBase);
// Save link
$stmt = $this->db->prepare(
"INSERT INTO links (page_id, crawl_job_id, source_url, target_url, link_text, is_nofollow, is_internal)
VALUES (?, ?, ?, ?, ?, ?, ?)"
);
$stmt->execute([
$pageId,
$this->crawlJobId,
$sourceUrl,
$targetUrl,
$linkText,
$isNofollow ? 1 : 0,
$isInternal ? 1 : 0
]);
// Add to queue if internal and not nofollow
if ($isInternal && !$isNofollow && $depth < 50) {
// Normalize URL (remove fragment, trailing slash)
$normalizedUrl = $this->normalizeUrl($targetUrl);
$this->addToQueue($normalizedUrl, $depth + 1);
}
} catch (\Exception $e) {
echo "Error processing link: " . $e->getMessage() . "\n";
}
});
echo "Processed $linkCount links from $sourceUrl\n";
}
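/** Resolve a possibly relative href against the URL of the page it appeared on. */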
private function makeAbsoluteUrl(string $url, string $base): string {
if (filter_var($url, FILTER_VALIDATE_URL)) {
return $url;
}
$parts = parse_url($base);
$scheme = $parts['scheme'] ?? 'http';
$host = $parts['host'] ?? '';
$path = $parts['path'] ?? '/';
// Protocol-relative URL (//host/path): reuse the scheme of the base URL
if (str_starts_with($url, '//')) {
return "$scheme:$url";
}
if ($url[0] === '/') {
return "$scheme://$host$url";
}
$basePath = substr($path, 0, strrpos($path, '/') + 1);
return "$scheme://$host$basePath$url";
}
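/** Mark a queue entry as failed and increment its retry counter. */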
private function handleError(array $queueItem, $reason): void {
$stmt = $this->db->prepare(
"UPDATE crawl_queue SET status = 'failed', processed_at = NOW(), retry_count = retry_count + 1 WHERE id = ?"
);
$stmt->execute([$queueItem['id']]);
}
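/** Write the total page and link counts back to the crawl job row. */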
private function updateJobStats(): void {
$stmt = $this->db->prepare(
"UPDATE crawl_jobs SET
total_pages = (SELECT COUNT(*) FROM pages WHERE crawl_job_id = ?),
total_links = (SELECT COUNT(*) FROM links WHERE crawl_job_id = ?)
WHERE id = ?"
);
$stmt->execute([$this->crawlJobId, $this->crawlJobId, $this->crawlJobId]);
}
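/**
 * Canonicalize a URL: lowercase the host, align the www prefix with the
 * start domain, drop the fragment, and strip the trailing slash except on the root path.
 */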
private function normalizeUrl(string $url): string {
// Parse URL
$parts = parse_url($url);
if (!$parts) {
return $url;
}
// Remove fragment
unset($parts['fragment']);
// Normalize domain (add www if base domain has it, or remove if base doesn't)
if (isset($parts['host'])) {
// Always convert to lowercase
$parts['host'] = strtolower($parts['host']);
// Match www pattern with base domain
$baseHasWww = str_starts_with($this->baseDomain, 'www.');
$urlHasWww = str_starts_with($parts['host'], 'www.');
if ($baseHasWww && !$urlHasWww) {
$parts['host'] = 'www.' . $parts['host'];
} elseif (!$baseHasWww && $urlHasWww) {
$parts['host'] = substr($parts['host'], 4);
}
}
// Normalize path - remove trailing slash except for root
if (isset($parts['path']) && $parts['path'] !== '/') {
$parts['path'] = rtrim($parts['path'], '/');
}
// Rebuild URL
$scheme = isset($parts['scheme']) ? $parts['scheme'] . '://' : '';
$host = $parts['host'] ?? '';
$port = isset($parts['port']) ? ':' . $parts['port'] : '';
$path = $parts['path'] ?? '/';
$query = isset($parts['query']) ? '?' . $parts['query'] : '';
return $scheme . $host . $port . $path . $query;
}
}
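
Below is a minimal usage sketch, not part of this commit: it assumes Composer autoloading is in place, the App\Database singleton used above is configured, and a crawl_jobs row with id 1 already exists.

<?php
// Hypothetical runner script (illustration only, not part of this commit)
require __DIR__ . '/vendor/autoload.php';

use App\Crawler;

$jobId = 1; // assumed to reference an existing crawl_jobs row
$crawler = new Crawler($jobId);
$crawler->start('https://www.example.com/');
echo "Crawl job $jobId finished\n";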