Add comprehensive quality tooling and fix code style issues
Quality Tools Added:
- PHPStan (Level 8) for static analysis
- PHP_CodeSniffer (PSR-12) for code style
- Updated PHPUnit test suite with type safety

Code Improvements:
- Fixed all PHPStan Level 8 errors (13 issues)
- Auto-fixed 25 PSR-12 code style violations
- Added proper type hints for arrays and method parameters
- Fixed PDOStatement|false handling in api.php and tests
- Improved null-safety for parse_url() calls

Configuration:
- phpstan.neon: Level 8, analyzes src/ and tests/
- phpcs.xml: PSR-12 standard, excludes vendor/
- docker-compose.yml: Mount config files for tooling
- composer.json: Add phpstan, phpcs, phpcbf scripts

Documentation:
- Updated README.md with testing and quality sections
- Updated AGENTS.md with quality gates and workflows
- Added pre-commit checklist for developers

All tests pass (9/9), PHPStan clean (0 errors), PHPCS compliant (1 warning)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -73,6 +73,9 @@ try {
|
||||
|
||||
case 'jobs':
|
||||
$stmt = $db->query("SELECT * FROM crawl_jobs ORDER BY created_at DESC LIMIT 50");
|
||||
if ($stmt === false) {
|
||||
throw new Exception('Failed to query jobs');
|
||||
}
|
||||
$jobs = $stmt->fetchAll();
|
||||
|
||||
echo json_encode([
|
||||
|
||||
@@ -8,15 +8,18 @@ use GuzzleHttp\Psr7\Request;
|
||||
use GuzzleHttp\Exception\RequestException;
|
||||
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
|
||||
|
||||
class Crawler {
|
||||
class Crawler
|
||||
{
|
||||
private \PDO $db;
|
||||
private Client $client;
|
||||
private int $concurrency = 10; // Parallel requests
|
||||
/** @var array<string, bool> */
|
||||
private array $visited = [];
|
||||
private int $crawlJobId;
|
||||
private string $baseDomain;
|
||||
|
||||
public function __construct(int $crawlJobId) {
|
||||
public function __construct(int $crawlJobId)
|
||||
{
|
||||
$this->db = Database::getInstance();
|
||||
$this->crawlJobId = $crawlJobId;
|
||||
$this->client = new Client([
|
||||
@@ -28,8 +31,10 @@ class Crawler {
|
||||
]);
|
||||
}
|
||||
|
||||
public function start(string $startUrl): void {
|
||||
$this->baseDomain = strtolower(parse_url($startUrl, PHP_URL_HOST));
|
||||
public function start(string $startUrl): void
|
||||
{
|
||||
$host = parse_url($startUrl, PHP_URL_HOST);
|
||||
$this->baseDomain = strtolower($host ?: '');
|
||||
|
||||
// Update job status
|
||||
$stmt = $this->db->prepare("UPDATE crawl_jobs SET status = 'running', started_at = NOW() WHERE id = ?");
|
||||
@@ -48,7 +53,8 @@ class Crawler {
|
||||
$stmt->execute([$this->crawlJobId]);
|
||||
}
|
||||
|
||||
private function addToQueue(string $url, int $depth): void {
|
||||
private function addToQueue(string $url, int $depth): void
|
||||
{
|
||||
if (isset($this->visited[$url])) {
|
||||
return;
|
||||
}
|
||||
@@ -63,7 +69,8 @@ class Crawler {
|
||||
}
|
||||
}
|
||||
|
||||
private function processQueue(): void {
|
||||
private function processQueue(): void
|
||||
{
|
||||
while (true) {
|
||||
// Get pending URLs
|
||||
$stmt = $this->db->prepare(
|
||||
@@ -82,14 +89,18 @@ class Crawler {
|
||||
}
|
||||
}
|
||||
|
||||
private function crawlBatch(array $urls): void {
|
||||
$requests = function() use ($urls) {
|
||||
/**
|
||||
* @param array<int, array{id: int, url: string, depth: int}> $urls
|
||||
*/
|
||||
private function crawlBatch(array $urls): void
|
||||
{
|
||||
$requests = function () use ($urls) {
|
||||
foreach ($urls as $item) {
|
||||
// Mark as processing
|
||||
$stmt = $this->db->prepare("UPDATE crawl_queue SET status = 'processing' WHERE id = ?");
|
||||
$stmt->execute([$item['id']]);
|
||||
|
||||
yield function() use ($item) {
|
||||
yield function () use ($item) {
|
||||
return $this->client->getAsync($item['url']);
|
||||
};
|
||||
}
|
||||
@@ -110,7 +121,12 @@ class Crawler {
|
||||
$pool->promise()->wait();
|
||||
}
|
||||
|
||||
private function handleResponse(array $queueItem, $response): void {
|
||||
/**
|
||||
* @param array{id: int, url: string, depth: int} $queueItem
|
||||
* @param \Psr\Http\Message\ResponseInterface $response
|
||||
*/
|
||||
private function handleResponse(array $queueItem, $response): void
|
||||
{
|
||||
$url = $queueItem['url'];
|
||||
$depth = $queueItem['depth'];
|
||||
|
||||
@@ -143,7 +159,7 @@ class Crawler {
|
||||
}
|
||||
|
||||
// Extract and save links
|
||||
if (str_contains($contentType, 'text/html')) {
|
||||
if (str_contains($contentType, 'text/html') && is_int($pageId)) {
|
||||
echo "Extracting links from: $url (pageId: $pageId)\n";
|
||||
$this->extractLinks($domCrawler, $url, $pageId, $depth);
|
||||
} else {
|
||||
@@ -155,7 +171,8 @@ class Crawler {
|
||||
$stmt->execute([$queueItem['id']]);
|
||||
}
|
||||
|
||||
private function extractLinks(DomCrawler $crawler, string $sourceUrl, int $pageId, int $depth): void {
|
||||
private function extractLinks(DomCrawler $crawler, string $sourceUrl, int $pageId, int $depth): void
|
||||
{
|
||||
$linkCount = 0;
|
||||
$crawler->filter('a')->each(function (DomCrawler $node) use ($sourceUrl, $pageId, $depth, &$linkCount) {
|
||||
try {
|
||||
@@ -176,7 +193,8 @@ class Crawler {
|
||||
$isNofollow = str_contains($rel, 'nofollow');
|
||||
|
||||
// Check if internal (same domain, no subdomains)
|
||||
$targetDomain = strtolower(parse_url($targetUrl, PHP_URL_HOST) ?? '');
|
||||
$targetHost = parse_url($targetUrl, PHP_URL_HOST);
|
||||
$targetDomain = strtolower($targetHost ?: '');
|
||||
$isInternal = ($targetDomain === $this->baseDomain);
|
||||
|
||||
// Save link
|
||||
@@ -207,7 +225,8 @@ class Crawler {
|
||||
echo "Processed $linkCount links from $sourceUrl\n";
|
||||
}
|
||||
|
||||
private function makeAbsoluteUrl(string $url, string $base): string {
|
||||
private function makeAbsoluteUrl(string $url, string $base): string
|
||||
{
|
||||
if (filter_var($url, FILTER_VALIDATE_URL)) {
|
||||
return $url;
|
||||
}
|
||||
@@ -225,14 +244,20 @@ class Crawler {
|
||||
return "$scheme://$host$basePath$url";
|
||||
}
|
||||
|
||||
private function handleError(array $queueItem, $reason): void {
|
||||
/**
|
||||
* @param array{id: int, url: string, depth: int} $queueItem
|
||||
* @param \GuzzleHttp\Exception\RequestException $reason
|
||||
*/
|
||||
private function handleError(array $queueItem, $reason): void
|
||||
{
|
||||
$stmt = $this->db->prepare(
|
||||
"UPDATE crawl_queue SET status = 'failed', processed_at = NOW(), retry_count = retry_count + 1 WHERE id = ?"
|
||||
);
|
||||
$stmt->execute([$queueItem['id']]);
|
||||
}
|
||||
|
||||
private function updateJobStats(): void {
|
||||
private function updateJobStats(): void
|
||||
{
|
||||
$stmt = $this->db->prepare(
|
||||
"UPDATE crawl_jobs SET
|
||||
total_pages = (SELECT COUNT(*) FROM pages WHERE crawl_job_id = ?),
|
||||
@@ -242,7 +267,8 @@ class Crawler {
|
||||
$stmt->execute([$this->crawlJobId, $this->crawlJobId, $this->crawlJobId]);
|
||||
}
|
||||
|
||||
private function normalizeUrl(string $url): string {
|
||||
private function normalizeUrl(string $url): string
|
||||
{
|
||||
// Parse URL
|
||||
$parts = parse_url($url);
|
||||
|
||||
|
||||
@@ -5,12 +5,16 @@ namespace App;
|
||||
use PDO;
|
||||
use PDOException;
|
||||
|
||||
class Database {
|
||||
class Database
|
||||
{
|
||||
private static ?PDO $instance = null;
|
||||
|
||||
private function __construct() {}
|
||||
private function __construct()
|
||||
{
|
||||
}
|
||||
|
||||
public static function getInstance(): PDO {
|
||||
public static function getInstance(): PDO
|
||||
{
|
||||
if (self::$instance === null) {
|
||||
try {
|
||||
self::$instance = new PDO(
|
||||
|
||||
@@ -9,7 +9,9 @@
|
||||
"symfony/css-selector": "^7.0"
|
||||
},
|
||||
"require-dev": {
|
||||
"phpunit/phpunit": "^11.0"
|
||||
"phpunit/phpunit": "^11.0",
|
||||
"phpstan/phpstan": "^2.1",
|
||||
"squizlabs/php_codesniffer": "^4.0"
|
||||
},
|
||||
"autoload": {
|
||||
"psr-4": {
|
||||
@@ -22,6 +24,9 @@
|
||||
}
|
||||
},
|
||||
"scripts": {
|
||||
"test": "phpunit"
|
||||
"test": "phpunit",
|
||||
"phpstan": "phpstan analyse -c ../phpstan.neon --memory-limit=512M",
|
||||
"phpcs": "phpcs --standard=PSR12 --ignore=/var/www/html/vendor /var/www/html /var/www/tests",
|
||||
"phpcbf": "phpcbf --standard=PSR12 --ignore=/var/www/html/vendor /var/www/html /var/www/tests"
|
||||
}
|
||||
}
|
||||
|
||||
134
src/composer.lock
generated
134
src/composer.lock
generated
@@ -4,7 +4,7 @@
|
||||
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
|
||||
"This file is @generated automatically"
|
||||
],
|
||||
"content-hash": "96376d6cdbd0e0665e091abe3e0ef8d8",
|
||||
"content-hash": "bb0d5fc291c18a44bfc693b94b302357",
|
||||
"packages": [
|
||||
{
|
||||
"name": "guzzlehttp/guzzle",
|
||||
@@ -1211,6 +1211,59 @@
|
||||
},
|
||||
"time": "2022-02-21T01:04:05+00:00"
|
||||
},
|
||||
{
|
||||
"name": "phpstan/phpstan",
|
||||
"version": "2.1.30",
|
||||
"dist": {
|
||||
"type": "zip",
|
||||
"url": "https://api.github.com/repos/phpstan/phpstan/zipball/a4a7f159927983dd4f7c8020ed227d80b7f39d7d",
|
||||
"reference": "a4a7f159927983dd4f7c8020ed227d80b7f39d7d",
|
||||
"shasum": ""
|
||||
},
|
||||
"require": {
|
||||
"php": "^7.4|^8.0"
|
||||
},
|
||||
"conflict": {
|
||||
"phpstan/phpstan-shim": "*"
|
||||
},
|
||||
"bin": [
|
||||
"phpstan",
|
||||
"phpstan.phar"
|
||||
],
|
||||
"type": "library",
|
||||
"autoload": {
|
||||
"files": [
|
||||
"bootstrap.php"
|
||||
]
|
||||
},
|
||||
"notification-url": "https://packagist.org/downloads/",
|
||||
"license": [
|
||||
"MIT"
|
||||
],
|
||||
"description": "PHPStan - PHP Static Analysis Tool",
|
||||
"keywords": [
|
||||
"dev",
|
||||
"static analysis"
|
||||
],
|
||||
"support": {
|
||||
"docs": "https://phpstan.org/user-guide/getting-started",
|
||||
"forum": "https://github.com/phpstan/phpstan/discussions",
|
||||
"issues": "https://github.com/phpstan/phpstan/issues",
|
||||
"security": "https://github.com/phpstan/phpstan/security/policy",
|
||||
"source": "https://github.com/phpstan/phpstan-src"
|
||||
},
|
||||
"funding": [
|
||||
{
|
||||
"url": "https://github.com/ondrejmirtes",
|
||||
"type": "github"
|
||||
},
|
||||
{
|
||||
"url": "https://github.com/phpstan",
|
||||
"type": "github"
|
||||
}
|
||||
],
|
||||
"time": "2025-10-02T16:07:52+00:00"
|
||||
},
|
||||
{
|
||||
"name": "phpunit/php-code-coverage",
|
||||
"version": "11.0.11",
|
||||
@@ -2641,6 +2694,85 @@
|
||||
],
|
||||
"time": "2024-10-09T05:16:32+00:00"
|
||||
},
|
||||
{
|
||||
"name": "squizlabs/php_codesniffer",
|
||||
"version": "4.0.0",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/PHPCSStandards/PHP_CodeSniffer.git",
|
||||
"reference": "06113cfdaf117fc2165f9cd040bd0f17fcd5242d"
|
||||
},
|
||||
"dist": {
|
||||
"type": "zip",
|
||||
"url": "https://api.github.com/repos/PHPCSStandards/PHP_CodeSniffer/zipball/06113cfdaf117fc2165f9cd040bd0f17fcd5242d",
|
||||
"reference": "06113cfdaf117fc2165f9cd040bd0f17fcd5242d",
|
||||
"shasum": ""
|
||||
},
|
||||
"require": {
|
||||
"ext-simplexml": "*",
|
||||
"ext-tokenizer": "*",
|
||||
"ext-xmlwriter": "*",
|
||||
"php": ">=7.2.0"
|
||||
},
|
||||
"require-dev": {
|
||||
"phpunit/phpunit": "^8.4.0 || ^9.3.4 || ^10.5.32 || 11.3.3 - 11.5.28 || ^11.5.31"
|
||||
},
|
||||
"bin": [
|
||||
"bin/phpcbf",
|
||||
"bin/phpcs"
|
||||
],
|
||||
"type": "library",
|
||||
"notification-url": "https://packagist.org/downloads/",
|
||||
"license": [
|
||||
"BSD-3-Clause"
|
||||
],
|
||||
"authors": [
|
||||
{
|
||||
"name": "Greg Sherwood",
|
||||
"role": "Former lead"
|
||||
},
|
||||
{
|
||||
"name": "Juliette Reinders Folmer",
|
||||
"role": "Current lead"
|
||||
},
|
||||
{
|
||||
"name": "Contributors",
|
||||
"homepage": "https://github.com/PHPCSStandards/PHP_CodeSniffer/graphs/contributors"
|
||||
}
|
||||
],
|
||||
"description": "PHP_CodeSniffer tokenizes PHP files and detects violations of a defined set of coding standards.",
|
||||
"homepage": "https://github.com/PHPCSStandards/PHP_CodeSniffer",
|
||||
"keywords": [
|
||||
"phpcs",
|
||||
"standards",
|
||||
"static analysis"
|
||||
],
|
||||
"support": {
|
||||
"issues": "https://github.com/PHPCSStandards/PHP_CodeSniffer/issues",
|
||||
"security": "https://github.com/PHPCSStandards/PHP_CodeSniffer/security/policy",
|
||||
"source": "https://github.com/PHPCSStandards/PHP_CodeSniffer",
|
||||
"wiki": "https://github.com/PHPCSStandards/PHP_CodeSniffer/wiki"
|
||||
},
|
||||
"funding": [
|
||||
{
|
||||
"url": "https://github.com/PHPCSStandards",
|
||||
"type": "github"
|
||||
},
|
||||
{
|
||||
"url": "https://github.com/jrfnl",
|
||||
"type": "github"
|
||||
},
|
||||
{
|
||||
"url": "https://opencollective.com/php_codesniffer",
|
||||
"type": "open_collective"
|
||||
},
|
||||
{
|
||||
"url": "https://thanks.dev/u/gh/phpcsstandards",
|
||||
"type": "thanks_dev"
|
||||
}
|
||||
],
|
||||
"time": "2025-09-15T11:28:58+00:00"
|
||||
},
|
||||
{
|
||||
"name": "staabm/side-effects-detector",
|
||||
"version": "1.0.5",
|
||||
|
||||
Reference in New Issue
Block a user