Initial push by Martin
🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
src/classes/Crawler.php (new normal file, 286 lines added)
@@ -0,0 +1,286 @@
<?php

namespace App;

use GuzzleHttp\Client;
use GuzzleHttp\Pool;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Exception\RequestException;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;

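/**
 * Queue-based site crawler: pulls pending URLs from the crawl_queue table in
 * batches, fetches them concurrently through a Guzzle request pool, stores
 * pages and links, and enqueues newly discovered internal URLs.
 */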
class Crawler {
    private \PDO $db;
    private Client $client;
    private int $concurrency = 10; // Parallel requests
    private array $visited = [];
    private int $crawlJobId;
    private string $baseDomain;

    public function __construct(int $crawlJobId) {
        $this->db = Database::getInstance();
        $this->crawlJobId = $crawlJobId;
        $this->client = new Client([
            'timeout' => 30,
            'verify'  => false, // Note: disables TLS certificate verification; risky outside development
            'headers' => [
                'User-Agent' => 'WebCrawler/1.0'
            ]
        ]);
    }

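    /**
     * Runs a full crawl: marks the job as running, seeds the queue with the
     * normalized start URL, drains the queue, then stores final stats.
     */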
    public function start(string $startUrl): void {
        // Cast to string so a URL without a host cannot pass null to strtolower()
        $this->baseDomain = strtolower((string) parse_url($startUrl, PHP_URL_HOST));

        // Mark the job as running
        $stmt = $this->db->prepare("UPDATE crawl_jobs SET status = 'running', started_at = NOW() WHERE id = ?");
        $stmt->execute([$this->crawlJobId]);

        // Normalize and enqueue the start URL
        $normalizedStartUrl = $this->normalizeUrl($startUrl);
        $this->addToQueue($normalizedStartUrl, 0);

        // Process the queue until it is empty
        $this->processQueue();

        // Record final stats and mark the job as completed
        $this->updateJobStats();
        $stmt = $this->db->prepare("UPDATE crawl_jobs SET status = 'completed', completed_at = NOW() WHERE id = ?");
        $stmt->execute([$this->crawlJobId]);
    }

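    /**
     * Queues a URL at the given depth unless it was already visited in this
     * run. INSERT IGNORE suppresses duplicates, assuming a unique index on
     * the queue's url column.
     */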
    private function addToQueue(string $url, int $depth): void {
        if (isset($this->visited[$url])) {
            return;
        }

        try {
            $stmt = $this->db->prepare(
                "INSERT IGNORE INTO crawl_queue (crawl_job_id, url, depth) VALUES (?, ?, ?)"
            );
            $stmt->execute([$this->crawlJobId, $url, $depth]);
        } catch (\Exception $e) {
            // URL already in queue
        }
    }

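    /**
     * Drains the queue: repeatedly pulls up to $concurrency pending rows and
     * crawls them as one batch until no pending rows remain.
     */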
    private function processQueue(): void {
        while (true) {
            // Fetch the next batch of pending URLs
            $stmt = $this->db->prepare(
                "SELECT id, url, depth FROM crawl_queue
                 WHERE crawl_job_id = ? AND status = 'pending'
                 LIMIT ?"
            );
            // Bind LIMIT explicitly as an integer: passing it via execute([...])
            // sends it as a quoted string, which MySQL rejects in a LIMIT clause
            // when prepares are emulated.
            $stmt->bindValue(1, $this->crawlJobId, \PDO::PARAM_INT);
            $stmt->bindValue(2, $this->concurrency, \PDO::PARAM_INT);
            $stmt->execute();
            $urls = $stmt->fetchAll();

            if (empty($urls)) {
                break;
            }

            $this->crawlBatch($urls);
        }
    }

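    /**
     * Fetches one batch concurrently through a Guzzle Pool. Rows are flagged
     * 'processing' as their requests are yielded; the fulfilled and rejected
     * callbacks map the pool index back to the originating queue row.
     */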
    private function crawlBatch(array $urls): void {
        $requests = function () use ($urls) {
            foreach ($urls as $item) {
                // Mark the row as processing
                $stmt = $this->db->prepare("UPDATE crawl_queue SET status = 'processing' WHERE id = ?");
                $stmt->execute([$item['id']]);

                yield function () use ($item) {
                    return $this->client->getAsync($item['url']);
                };
            }
        };

        $pool = new Pool($this->client, $requests(), [
            'concurrency' => $this->concurrency,
            'fulfilled' => function ($response, $index) use ($urls) {
                $this->handleResponse($urls[$index], $response);
            },
            'rejected' => function ($reason, $index) use ($urls) {
                $this->handleError($urls[$index], $reason);
            },
        ]);

        $pool->promise()->wait();
    }

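    /**
     * Persists a fetched page, extracts links when the response is HTML, and
     * marks the queue row as completed.
     */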
    private function handleResponse(array $queueItem, $response): void {
        $url = $queueItem['url'];
        $depth = $queueItem['depth'];

        $this->visited[$url] = true;

        $statusCode = $response->getStatusCode();
        $contentType = $response->getHeaderLine('Content-Type');
        $body = $response->getBody()->getContents();

        // Parse the document and pull the <title> if present
        $domCrawler = new DomCrawler($body, $url);
        $title = $domCrawler->filter('title')->count() > 0
            ? $domCrawler->filter('title')->text()
            : '';

        // Upsert the page; ON DUPLICATE KEY UPDATE id = LAST_INSERT_ID(id)
        // makes lastInsertId() return the existing row's id on a duplicate.
        $stmt = $this->db->prepare(
            "INSERT INTO pages (crawl_job_id, url, title, status_code, content_type)
             VALUES (?, ?, ?, ?, ?)
             ON DUPLICATE KEY UPDATE id = LAST_INSERT_ID(id), status_code = VALUES(status_code)"
        );
        $stmt->execute([$this->crawlJobId, $url, $title, $statusCode, $contentType]);
        $pageId = (int) $this->db->lastInsertId();

        // Fallback: fetch the id manually if lastInsertId() returned 0
        if ($pageId === 0) {
            $stmt = $this->db->prepare("SELECT id FROM pages WHERE crawl_job_id = ? AND url = ?");
            $stmt->execute([$this->crawlJobId, $url]);
            $pageId = (int) $stmt->fetchColumn();
        }

        // Extract and save links from HTML responses only
        if (str_contains($contentType, 'text/html')) {
            echo "Extracting links from: $url (pageId: $pageId)\n";
            $this->extractLinks($domCrawler, $url, $pageId, $depth);
        } else {
            echo "Skipping link extraction - content type: $contentType\n";
        }

        // Mark the queue row as completed
        $stmt = $this->db->prepare("UPDATE crawl_queue SET status = 'completed', processed_at = NOW() WHERE id = ?");
        $stmt->execute([$queueItem['id']]);
    }

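    /**
     * Walks every <a> element of the page, resolves each href against the
     * source URL, stores the link, and enqueues internal, followable links.
     */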
    private function extractLinks(DomCrawler $crawler, string $sourceUrl, int $pageId, int $depth): void {
        $linkCount = 0;
        $crawler->filter('a')->each(function (DomCrawler $node) use ($sourceUrl, $pageId, $depth, &$linkCount) {
            try {
                $linkCount++;
                $href = $node->attr('href');
                if (!$href || $href === '#') {
                    return;
                }

                // Convert relative URLs to absolute
                $targetUrl = $this->makeAbsoluteUrl($href, $sourceUrl);

                // Get link text
                $linkText = trim($node->text());

                // Check nofollow
                $rel = $node->attr('rel') ?? '';
                $isNofollow = str_contains($rel, 'nofollow');

                // Check if internal (same domain, no subdomains)
                $targetDomain = strtolower(parse_url($targetUrl, PHP_URL_HOST) ?? '');
                $isInternal = ($targetDomain === $this->baseDomain);

                // Save link
                $stmt = $this->db->prepare(
                    "INSERT INTO links (page_id, crawl_job_id, source_url, target_url, link_text, is_nofollow, is_internal)
                     VALUES (?, ?, ?, ?, ?, ?, ?)"
                );
                $stmt->execute([
                    $pageId,
                    $this->crawlJobId,
                    $sourceUrl,
                    $targetUrl,
                    $linkText,
                    $isNofollow ? 1 : 0,
                    $isInternal ? 1 : 0
                ]);

                // Follow internal, non-nofollow links up to a maximum depth of 50
                if ($isInternal && !$isNofollow && $depth < 50) {
                    // Normalize URL (remove fragment, trailing slash)
                    $normalizedUrl = $this->normalizeUrl($targetUrl);
                    $this->addToQueue($normalizedUrl, $depth + 1);
                }
            } catch (\Exception $e) {
                echo "Error processing link: " . $e->getMessage() . "\n";
            }
        });
        echo "Processed $linkCount links from $sourceUrl\n";
    }

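    /**
     * Resolves a possibly relative href against its base URL. Handles
     * absolute, protocol-relative, root-relative, and path-relative forms;
     * ../ segments are kept literally rather than collapsed.
     */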
    private function makeAbsoluteUrl(string $url, string $base): string {
        if (filter_var($url, FILTER_VALIDATE_URL)) {
            return $url;
        }

        $parts = parse_url($base);
        $scheme = $parts['scheme'] ?? 'http';
        $host = $parts['host'] ?? '';
        $path = $parts['path'] ?? '/';

        // Protocol-relative URLs (//example.com/...) inherit the base scheme
        if (str_starts_with($url, '//')) {
            return "$scheme:$url";
        }

        if ($url[0] === '/') {
            return "$scheme://$host$url";
        }

        $basePath = substr($path, 0, strrpos($path, '/') + 1);
        return "$scheme://$host$basePath$url";
    }

    private function handleError(array $queueItem, $reason): void {
        // Mark the row as failed; $reason is typically a RequestException
        $stmt = $this->db->prepare(
            "UPDATE crawl_queue SET status = 'failed', processed_at = NOW(), retry_count = retry_count + 1 WHERE id = ?"
        );
        $stmt->execute([$queueItem['id']]);
    }

    private function updateJobStats(): void {
        $stmt = $this->db->prepare(
            "UPDATE crawl_jobs SET
                total_pages = (SELECT COUNT(*) FROM pages WHERE crawl_job_id = ?),
                total_links = (SELECT COUNT(*) FROM links WHERE crawl_job_id = ?)
             WHERE id = ?"
        );
        $stmt->execute([$this->crawlJobId, $this->crawlJobId, $this->crawlJobId]);
    }

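    /**
     * Canonicalizes a URL so trivially different spellings dedupe in the
     * queue: lowercases the host, matches its www prefix to the base domain,
     * and strips the fragment and any trailing slash.
     */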
    private function normalizeUrl(string $url): string {
        $parts = parse_url($url);
        if (!$parts) {
            return $url;
        }

        // Remove fragment
        unset($parts['fragment']);

        // Normalize the host: lowercase, and match the www prefix of the base domain
        if (isset($parts['host'])) {
            $parts['host'] = strtolower($parts['host']);

            $baseHasWww = str_starts_with($this->baseDomain, 'www.');
            $urlHasWww = str_starts_with($parts['host'], 'www.');

            if ($baseHasWww && !$urlHasWww) {
                $parts['host'] = 'www.' . $parts['host'];
            } elseif (!$baseHasWww && $urlHasWww) {
                $parts['host'] = substr($parts['host'], 4);
            }
        }

        // Normalize path - remove trailing slash except for root
        if (isset($parts['path']) && $parts['path'] !== '/') {
            $parts['path'] = rtrim($parts['path'], '/');
        }

        // Rebuild URL
        $scheme = isset($parts['scheme']) ? $parts['scheme'] . '://' : '';
        $host = $parts['host'] ?? '';
        $port = isset($parts['port']) ? ':' . $parts['port'] : '';
        $path = $parts['path'] ?? '/';
        $query = isset($parts['query']) ? '?' . $parts['query'] : '';

        return $scheme . $host . $port . $path . $query;
    }
}
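A minimal CLI runner for this class, as a sketch only: it assumes Composer autoloading, a Database singleton wired to the schema above, and a crawl_jobs row created beforehand, none of which are part of this commit.

<?php
// Hypothetical runner script (crawl.php), not included in this commit.
require __DIR__ . '/vendor/autoload.php';

use App\Crawler;

$jobId    = (int) ($argv[1] ?? 0);  // id of an existing crawl_jobs row
$startUrl = $argv[2] ?? '';         // e.g. https://www.example.com/

$crawler = new Crawler($jobId);
$crawler->start($startUrl);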