- Added "Recrawl" button in jobs table UI - Implemented recrawl API endpoint that deletes all job data and restarts crawl - Fixed PHPCS line length warnings in api.php and Crawler.php All quality checks pass: - PHPStan Level 8: 0 errors - PHPCS PSR-12: 0 warnings 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
<?php

/**
 * Web Crawler - Crawler Class
 *
 * @copyright Copyright (c) 2025 Martin Kiesewetter
 * @author Martin Kiesewetter <mki@kies-media.de>
 * @link https://kies-media.de
 */

namespace App;

use GuzzleHttp\Client;
use GuzzleHttp\Pool;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Exception\RequestException;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;

/**
 * Crawls a site starting from a seed URL, draining the crawl_queue table in
 * parallel batches and persisting discovered pages and links.
 */
class Crawler
{
    private \PDO $db;
    private Client $client;
    private int $concurrency = 10; // Parallel requests
    /** @var array<string, bool> */
    private array $visited = [];
    private int $crawlJobId;
    private string $baseDomain;

    public function __construct(int $crawlJobId)
    {
        $this->db = Database::getInstance();
        $this->crawlJobId = $crawlJobId;
        $this->client = new Client([
            'timeout' => 30,
            'verify' => false,
            'headers' => [
                'User-Agent' => 'WebCrawler/1.0'
            ]
        ]);
    }
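
    // Usage sketch (illustrative; not part of the original file). The caller is
    // expected to have created the matching crawl_jobs row already; the job id
    // and URL below are placeholders:
    //
    //     $crawler = new \App\Crawler($crawlJobId);
    //     $crawler->start('https://www.example.com/');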

    public function start(string $startUrl): void
    {
        $host = parse_url($startUrl, PHP_URL_HOST);
        $this->baseDomain = strtolower($host ?: '');

        // Update job status
        $stmt = $this->db->prepare("UPDATE crawl_jobs SET status = 'running', started_at = NOW() WHERE id = ?");
        $stmt->execute([$this->crawlJobId]);

        // Normalize and add start URL to queue
        $normalizedStartUrl = $this->normalizeUrl($startUrl);
        $this->addToQueue($normalizedStartUrl, 0);

        // Process queue
        $this->processQueue();

        // Update job stats and mark the job as completed
        $this->updateJobStats();
        $stmt = $this->db->prepare("UPDATE crawl_jobs SET status = 'completed', completed_at = NOW() WHERE id = ?");
        $stmt->execute([$this->crawlJobId]);
    }

    /**
     * Queues a URL for crawling unless it has already been visited.
     * Duplicate queue entries are suppressed by INSERT IGNORE.
     */
    private function addToQueue(string $url, int $depth): void
    {
        if (isset($this->visited[$url])) {
            return;
        }

        try {
            $stmt = $this->db->prepare(
                "INSERT IGNORE INTO crawl_queue (crawl_job_id, url, depth) VALUES (?, ?, ?)"
            );
            $stmt->execute([$this->crawlJobId, $url, $depth]);
        } catch (\Exception $e) {
            // Ignore insert failures (e.g. a race on an already-queued URL)
        }
    }
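
    // The queue queries in this class assume a crawl_queue table roughly like
    // the sketch below. This DDL is reconstructed from the columns the class
    // touches (the real schema lives elsewhere in the project); note that the
    // INSERT IGNORE above only deduplicates if a unique key such as
    // (crawl_job_id, url) exists.
    //
    //     CREATE TABLE crawl_queue (
    //         id INT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
    //         crawl_job_id INT UNSIGNED NOT NULL,
    //         url VARCHAR(2048) NOT NULL,
    //         depth INT NOT NULL DEFAULT 0,
    //         status ENUM('pending', 'processing', 'completed', 'failed') NOT NULL DEFAULT 'pending',
    //         retry_count INT NOT NULL DEFAULT 0,
    //         processed_at DATETIME NULL,
    //         UNIQUE KEY uniq_job_url (crawl_job_id, url(255))
    //     );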

    private function processQueue(): void
    {
        while (true) {
            // Get pending URLs
            $stmt = $this->db->prepare(
                "SELECT id, url, depth FROM crawl_queue
                WHERE crawl_job_id = ? AND status = 'pending'
                LIMIT ?"
            );
            $stmt->execute([$this->crawlJobId, $this->concurrency]);
            $urls = $stmt->fetchAll();

            if (empty($urls)) {
                break;
            }

            $this->crawlBatch($urls);
        }
    }
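
    // Note (observation, not original documentation): the SELECT above only
    // picks up rows with status = 'pending', so rows left in 'processing' after
    // a crash and rows marked 'failed' are not retried by this loop.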

    /**
     * Marks each queued row as processing and fetches the batch concurrently
     * through a Guzzle request pool.
     *
     * @param array<int, array{id: int, url: string, depth: int}> $urls
     */
    private function crawlBatch(array $urls): void
    {
        $requests = function () use ($urls) {
            foreach ($urls as $item) {
                // Mark as processing
                $stmt = $this->db->prepare("UPDATE crawl_queue SET status = 'processing' WHERE id = ?");
                $stmt->execute([$item['id']]);

                yield function () use ($item) {
                    return $this->client->getAsync($item['url']);
                };
            }
        };

        $pool = new Pool($this->client, $requests(), [
            'concurrency' => $this->concurrency,
            'fulfilled' => function ($response, $index) use ($urls) {
                $item = $urls[$index];
                $this->handleResponse($item, $response);
            },
            'rejected' => function ($reason, $index) use ($urls) {
                $item = $urls[$index];
                $this->handleError($item, $reason);
            },
        ]);

        $pool->promise()->wait();
    }

    /**
     * Stores the fetched page, extracts links from HTML responses and marks the
     * queue row as completed.
     *
     * @param array{id: int, url: string, depth: int} $queueItem
     * @param \Psr\Http\Message\ResponseInterface $response
     */
    private function handleResponse(array $queueItem, $response): void
    {
        $url = $queueItem['url'];
        $depth = $queueItem['depth'];

        $this->visited[$url] = true;

        $statusCode = $response->getStatusCode();
        $contentType = $response->getHeaderLine('Content-Type');
        $body = $response->getBody()->getContents();

        // Save page
        $domCrawler = new DomCrawler($body, $url);
        $title = $domCrawler->filter('title')->count() > 0
            ? $domCrawler->filter('title')->text()
            : '';

        // id=LAST_INSERT_ID(id) makes lastInsertId() return the existing row's id
        // when the INSERT hits a duplicate key
        $stmt = $this->db->prepare(
            "INSERT INTO pages (crawl_job_id, url, title, status_code, content_type)
            VALUES (?, ?, ?, ?, ?)
            ON DUPLICATE KEY UPDATE id=LAST_INSERT_ID(id), status_code = VALUES(status_code)"
        );

        $stmt->execute([$this->crawlJobId, $url, $title, $statusCode, $contentType]);
        $pageId = $this->db->lastInsertId();

        // If pageId is 0, fetch it manually
        if ($pageId == 0 || $pageId === '0') {
            $stmt = $this->db->prepare("SELECT id FROM pages WHERE crawl_job_id = ? AND url = ?");
            $stmt->execute([$this->crawlJobId, $url]);
            $fetchedId = $stmt->fetchColumn();
            $pageId = is_numeric($fetchedId) ? (int)$fetchedId : 0;
        }

        // Ensure pageId is an integer
        $pageId = is_numeric($pageId) ? (int)$pageId : 0;

        // Extract and save links
        if (str_contains($contentType, 'text/html') && $pageId > 0) {
            echo "Extracting links from: $url (pageId: $pageId)\n";
            $this->extractLinks($domCrawler, $url, $pageId, $depth);
        } else {
            echo "Skipping link extraction - content type: $contentType\n";
        }

        // Mark as completed
        $stmt = $this->db->prepare("UPDATE crawl_queue SET status = 'completed', processed_at = NOW() WHERE id = ?");
        $stmt->execute([$queueItem['id']]);
    }

    /**
     * Extracts all anchor links from the page, stores them and queues internal,
     * followable URLs for further crawling.
     */
    private function extractLinks(DomCrawler $crawler, string $sourceUrl, int $pageId, int $depth): void
    {
        $linkCount = 0;
        $crawler->filter('a')->each(function (DomCrawler $node) use ($sourceUrl, $pageId, $depth, &$linkCount) {
            try {
                $linkCount++;
                $href = $node->attr('href');
                if (!$href || $href === '#') {
                    return;
                }

                // Convert relative URLs to absolute
                $targetUrl = $this->makeAbsoluteUrl($href, $sourceUrl);

                // Get link text
                $linkText = trim($node->text());

                // Check nofollow
                $rel = $node->attr('rel') ?? '';
                $isNofollow = str_contains($rel, 'nofollow');

                // Check if internal (same domain, no subdomains)
                $targetHost = parse_url($targetUrl, PHP_URL_HOST);
                $targetDomain = strtolower($targetHost ?: '');
                $isInternal = ($targetDomain === $this->baseDomain);

                // Save link
                $stmt = $this->db->prepare(
                    "INSERT INTO links (page_id, crawl_job_id, source_url, target_url, " .
                    "link_text, is_nofollow, is_internal) VALUES (?, ?, ?, ?, ?, ?, ?)"
                );
                $stmt->execute([
                    $pageId,
                    $this->crawlJobId,
                    $sourceUrl,
                    $targetUrl,
                    $linkText,
                    $isNofollow ? 1 : 0,
                    $isInternal ? 1 : 0
                ]);

                // Add to queue if internal, followable and within the depth limit
                if ($isInternal && !$isNofollow && $depth < 50) {
                    // Normalize URL (remove fragment, trailing slash)
                    $normalizedUrl = $this->normalizeUrl($targetUrl);
                    $this->addToQueue($normalizedUrl, $depth + 1);
                }
            } catch (\Exception $e) {
                echo "Error processing link: " . $e->getMessage() . "\n";
            }
        });
        echo "Processed $linkCount links from $sourceUrl\n";
    }
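
    // Illustrative results of makeAbsoluteUrl() below, derived by reading the
    // code (not taken from project documentation):
    //
    //     makeAbsoluteUrl('/contact', 'https://example.com/blog/post')
    //         => 'https://example.com/contact'
    //     makeAbsoluteUrl('team.html', 'https://example.com/about/index.html')
    //         => 'https://example.com/about/team.html'
    //     makeAbsoluteUrl('https://other.example.org/x', 'https://example.com/')
    //         => 'https://other.example.org/x' (already absolute, returned unchanged)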

    private function makeAbsoluteUrl(string $url, string $base): string
    {
        if (filter_var($url, FILTER_VALIDATE_URL)) {
            return $url;
        }

        $parts = parse_url($base);
        $scheme = $parts['scheme'] ?? 'http';
        $host = $parts['host'] ?? '';
        $path = $parts['path'] ?? '/';

        // Protocol-relative URLs (e.g. //cdn.example.com/app.js) keep the base scheme
        if (str_starts_with($url, '//')) {
            return "$scheme:$url";
        }

        if ($url[0] === '/') {
            return "$scheme://$host$url";
        }

        $basePath = substr($path, 0, strrpos($path, '/') + 1);
        return "$scheme://$host$basePath$url";
    }

    /**
     * Marks a queue row as failed when its request is rejected.
     *
     * @param array{id: int, url: string, depth: int} $queueItem
     * @param \GuzzleHttp\Exception\RequestException $reason
     */
    private function handleError(array $queueItem, $reason): void
    {
        $stmt = $this->db->prepare(
            "UPDATE crawl_queue SET status = 'failed', processed_at = NOW(), retry_count = retry_count + 1 WHERE id = ?"
        );
        $stmt->execute([$queueItem['id']]);
    }

    private function updateJobStats(): void
    {
        $stmt = $this->db->prepare(
            "UPDATE crawl_jobs SET
            total_pages = (SELECT COUNT(*) FROM pages WHERE crawl_job_id = ?),
            total_links = (SELECT COUNT(*) FROM links WHERE crawl_job_id = ?)
            WHERE id = ?"
        );
        $stmt->execute([$this->crawlJobId, $this->crawlJobId, $this->crawlJobId]);
    }
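
    // Illustrative results of normalizeUrl() below, derived by reading the code
    // and assuming the job was started on 'https://example.com' (so baseDomain
    // has no 'www.' prefix):
    //
    //     normalizeUrl('https://WWW.Example.com/Pricing/?ref=nav#top')
    //         => 'https://example.com/Pricing?ref=nav'  (fragment dropped, host
    //            lowercased and de-www'd, trailing slash removed; path case kept)
    //     normalizeUrl('https://example.com/')
    //         => 'https://example.com/'                  (root path keeps its slash)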

    private function normalizeUrl(string $url): string
    {
        // Parse URL
        $parts = parse_url($url);

        if (!$parts) {
            return $url;
        }

        // Remove fragment
        unset($parts['fragment']);

        // Normalize domain (add www if base domain has it, or remove if base doesn't)
        if (isset($parts['host'])) {
            // Always convert to lowercase
            $parts['host'] = strtolower($parts['host']);

            // Match www pattern with base domain
            $baseHasWww = str_starts_with($this->baseDomain, 'www.');
            $urlHasWww = str_starts_with($parts['host'], 'www.');

            if ($baseHasWww && !$urlHasWww) {
                $parts['host'] = 'www.' . $parts['host'];
            } elseif (!$baseHasWww && $urlHasWww) {
                $parts['host'] = substr($parts['host'], 4);
            }
        }

        // Normalize path - remove trailing slash except for root
        if (isset($parts['path']) && $parts['path'] !== '/') {
            $parts['path'] = rtrim($parts['path'], '/');
        }

        // Rebuild URL
        $scheme = isset($parts['scheme']) ? $parts['scheme'] . '://' : '';
        $host = $parts['host'] ?? '';
        $port = isset($parts['port']) ? ':' . $parts['port'] : '';
        $path = $parts['path'] ?? '/';
        $query = isset($parts['query']) ? '?' . $parts['query'] : '';

        return $scheme . $host . $port . $path . $query;
    }
}