Add comprehensive quality tooling and fix code style issues

Quality Tools Added:
- PHPStan (Level 8) for static analysis
- PHP_CodeSniffer (PSR-12) for code style
- Updated the PHPUnit test suite for type safety

Code Improvements:
- Fixed all PHPStan Level 8 errors (13 issues)
- Auto-fixed 25 PSR-12 code style violations
- Added proper type hints for arrays and method parameters
- Fixed PDOStatement|false handling in api.php and tests
- Improved null-safety for parse_url() calls

Configuration:
- phpstan.neon: Level 8, analyzes src/ and tests/
- phpcs.xml: PSR-12 standard, excludes vendor/
- docker-compose.yml: mounts config files for tooling
- composer.json: adds phpstan, phpcs, phpcbf scripts

Documentation:
- Updated README.md with testing and quality sections
- Updated AGENTS.md with quality gates and workflows
- Added pre-commit checklist for developers

All tests pass (9/9), PHPStan is clean (0 errors), and PHPCS reports no errors (1 warning remaining)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-10-03 23:58:21 +02:00
parent b5640ad131
commit e569d189d5
13 changed files with 378 additions and 27 deletions

View File

@@ -73,6 +73,9 @@ try {
case 'jobs':
$stmt = $db->query("SELECT * FROM crawl_jobs ORDER BY created_at DESC LIMIT 50");
if ($stmt === false) {
throw new Exception('Failed to query jobs');
}
$jobs = $stmt->fetchAll();
echo json_encode([

View File

@@ -8,15 +8,18 @@ use GuzzleHttp\Psr7\Request;
use GuzzleHttp\Exception\RequestException;
use Symfony\Component\DomCrawler\Crawler as DomCrawler;
class Crawler {
class Crawler
{
private \PDO $db;
private Client $client;
private int $concurrency = 10; // Parallel requests
/** @var array<string, bool> */
private array $visited = [];
private int $crawlJobId;
private string $baseDomain;
public function __construct(int $crawlJobId) {
public function __construct(int $crawlJobId)
{
$this->db = Database::getInstance();
$this->crawlJobId = $crawlJobId;
$this->client = new Client([
@@ -28,8 +31,10 @@ class Crawler {
]);
}
public function start(string $startUrl): void {
$this->baseDomain = strtolower(parse_url($startUrl, PHP_URL_HOST));
public function start(string $startUrl): void
{
$host = parse_url($startUrl, PHP_URL_HOST);
$this->baseDomain = strtolower($host ?: '');
// Update job status
$stmt = $this->db->prepare("UPDATE crawl_jobs SET status = 'running', started_at = NOW() WHERE id = ?");
@@ -48,7 +53,8 @@ class Crawler {
$stmt->execute([$this->crawlJobId]);
}
private function addToQueue(string $url, int $depth): void {
private function addToQueue(string $url, int $depth): void
{
if (isset($this->visited[$url])) {
return;
}
@@ -63,7 +69,8 @@ class Crawler {
}
}
private function processQueue(): void {
private function processQueue(): void
{
while (true) {
// Get pending URLs
$stmt = $this->db->prepare(
@@ -82,14 +89,18 @@ class Crawler {
}
}
private function crawlBatch(array $urls): void {
$requests = function() use ($urls) {
/**
* @param array<int, array{id: int, url: string, depth: int}> $urls
*/
private function crawlBatch(array $urls): void
{
$requests = function () use ($urls) {
foreach ($urls as $item) {
// Mark as processing
$stmt = $this->db->prepare("UPDATE crawl_queue SET status = 'processing' WHERE id = ?");
$stmt->execute([$item['id']]);
yield function() use ($item) {
yield function () use ($item) {
return $this->client->getAsync($item['url']);
};
}
@@ -110,7 +121,12 @@ class Crawler {
$pool->promise()->wait();
}
private function handleResponse(array $queueItem, $response): void {
/**
* @param array{id: int, url: string, depth: int} $queueItem
* @param \Psr\Http\Message\ResponseInterface $response
*/
private function handleResponse(array $queueItem, $response): void
{
$url = $queueItem['url'];
$depth = $queueItem['depth'];
@@ -143,7 +159,7 @@ class Crawler {
}
// Extract and save links
if (str_contains($contentType, 'text/html')) {
if (str_contains($contentType, 'text/html') && is_int($pageId)) {
echo "Extracting links from: $url (pageId: $pageId)\n";
$this->extractLinks($domCrawler, $url, $pageId, $depth);
} else {
@@ -155,7 +171,8 @@ class Crawler {
$stmt->execute([$queueItem['id']]);
}
private function extractLinks(DomCrawler $crawler, string $sourceUrl, int $pageId, int $depth): void {
private function extractLinks(DomCrawler $crawler, string $sourceUrl, int $pageId, int $depth): void
{
$linkCount = 0;
$crawler->filter('a')->each(function (DomCrawler $node) use ($sourceUrl, $pageId, $depth, &$linkCount) {
try {
@@ -176,7 +193,8 @@ class Crawler {
$isNofollow = str_contains($rel, 'nofollow');
// Check if internal (same domain, no subdomains)
$targetDomain = strtolower(parse_url($targetUrl, PHP_URL_HOST) ?? '');
$targetHost = parse_url($targetUrl, PHP_URL_HOST);
$targetDomain = strtolower($targetHost ?: '');
$isInternal = ($targetDomain === $this->baseDomain);
// Save link
@@ -207,7 +225,8 @@ class Crawler {
echo "Processed $linkCount links from $sourceUrl\n";
}
private function makeAbsoluteUrl(string $url, string $base): string {
private function makeAbsoluteUrl(string $url, string $base): string
{
if (filter_var($url, FILTER_VALIDATE_URL)) {
return $url;
}
@@ -225,14 +244,20 @@ class Crawler {
return "$scheme://$host$basePath$url";
}
private function handleError(array $queueItem, $reason): void {
/**
* @param array{id: int, url: string, depth: int} $queueItem
* @param \GuzzleHttp\Exception\RequestException $reason
*/
private function handleError(array $queueItem, $reason): void
{
$stmt = $this->db->prepare(
"UPDATE crawl_queue SET status = 'failed', processed_at = NOW(), retry_count = retry_count + 1 WHERE id = ?"
);
$stmt->execute([$queueItem['id']]);
}
private function updateJobStats(): void {
private function updateJobStats(): void
{
$stmt = $this->db->prepare(
"UPDATE crawl_jobs SET
total_pages = (SELECT COUNT(*) FROM pages WHERE crawl_job_id = ?),
@@ -242,7 +267,8 @@ class Crawler {
$stmt->execute([$this->crawlJobId, $this->crawlJobId, $this->crawlJobId]);
}
private function normalizeUrl(string $url): string {
private function normalizeUrl(string $url): string
{
// Parse URL
$parts = parse_url($url);

View File

@@ -5,12 +5,16 @@ namespace App;
use PDO;
use PDOException;
class Database {
class Database
{
private static ?PDO $instance = null;
private function __construct() {}
private function __construct()
{
}
public static function getInstance(): PDO {
public static function getInstance(): PDO
{
if (self::$instance === null) {
try {
self::$instance = new PDO(

View File

@@ -9,7 +9,9 @@
"symfony/css-selector": "^7.0"
},
"require-dev": {
"phpunit/phpunit": "^11.0"
"phpunit/phpunit": "^11.0",
"phpstan/phpstan": "^2.1",
"squizlabs/php_codesniffer": "^4.0"
},
"autoload": {
"psr-4": {
@@ -22,6 +24,9 @@
}
},
"scripts": {
"test": "phpunit"
"test": "phpunit",
"phpstan": "phpstan analyse -c ../phpstan.neon --memory-limit=512M",
"phpcs": "phpcs --standard=PSR12 --ignore=/var/www/html/vendor /var/www/html /var/www/tests",
"phpcbf": "phpcbf --standard=PSR12 --ignore=/var/www/html/vendor /var/www/html /var/www/tests"
}
}

134
src/composer.lock generated
View File

@@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
"content-hash": "96376d6cdbd0e0665e091abe3e0ef8d8",
"content-hash": "bb0d5fc291c18a44bfc693b94b302357",
"packages": [
{
"name": "guzzlehttp/guzzle",
@@ -1211,6 +1211,59 @@
},
"time": "2022-02-21T01:04:05+00:00"
},
{
"name": "phpstan/phpstan",
"version": "2.1.30",
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/phpstan/phpstan/zipball/a4a7f159927983dd4f7c8020ed227d80b7f39d7d",
"reference": "a4a7f159927983dd4f7c8020ed227d80b7f39d7d",
"shasum": ""
},
"require": {
"php": "^7.4|^8.0"
},
"conflict": {
"phpstan/phpstan-shim": "*"
},
"bin": [
"phpstan",
"phpstan.phar"
],
"type": "library",
"autoload": {
"files": [
"bootstrap.php"
]
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"description": "PHPStan - PHP Static Analysis Tool",
"keywords": [
"dev",
"static analysis"
],
"support": {
"docs": "https://phpstan.org/user-guide/getting-started",
"forum": "https://github.com/phpstan/phpstan/discussions",
"issues": "https://github.com/phpstan/phpstan/issues",
"security": "https://github.com/phpstan/phpstan/security/policy",
"source": "https://github.com/phpstan/phpstan-src"
},
"funding": [
{
"url": "https://github.com/ondrejmirtes",
"type": "github"
},
{
"url": "https://github.com/phpstan",
"type": "github"
}
],
"time": "2025-10-02T16:07:52+00:00"
},
{
"name": "phpunit/php-code-coverage",
"version": "11.0.11",
@@ -2641,6 +2694,85 @@
],
"time": "2024-10-09T05:16:32+00:00"
},
{
"name": "squizlabs/php_codesniffer",
"version": "4.0.0",
"source": {
"type": "git",
"url": "https://github.com/PHPCSStandards/PHP_CodeSniffer.git",
"reference": "06113cfdaf117fc2165f9cd040bd0f17fcd5242d"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/PHPCSStandards/PHP_CodeSniffer/zipball/06113cfdaf117fc2165f9cd040bd0f17fcd5242d",
"reference": "06113cfdaf117fc2165f9cd040bd0f17fcd5242d",
"shasum": ""
},
"require": {
"ext-simplexml": "*",
"ext-tokenizer": "*",
"ext-xmlwriter": "*",
"php": ">=7.2.0"
},
"require-dev": {
"phpunit/phpunit": "^8.4.0 || ^9.3.4 || ^10.5.32 || 11.3.3 - 11.5.28 || ^11.5.31"
},
"bin": [
"bin/phpcbf",
"bin/phpcs"
],
"type": "library",
"notification-url": "https://packagist.org/downloads/",
"license": [
"BSD-3-Clause"
],
"authors": [
{
"name": "Greg Sherwood",
"role": "Former lead"
},
{
"name": "Juliette Reinders Folmer",
"role": "Current lead"
},
{
"name": "Contributors",
"homepage": "https://github.com/PHPCSStandards/PHP_CodeSniffer/graphs/contributors"
}
],
"description": "PHP_CodeSniffer tokenizes PHP files and detects violations of a defined set of coding standards.",
"homepage": "https://github.com/PHPCSStandards/PHP_CodeSniffer",
"keywords": [
"phpcs",
"standards",
"static analysis"
],
"support": {
"issues": "https://github.com/PHPCSStandards/PHP_CodeSniffer/issues",
"security": "https://github.com/PHPCSStandards/PHP_CodeSniffer/security/policy",
"source": "https://github.com/PHPCSStandards/PHP_CodeSniffer",
"wiki": "https://github.com/PHPCSStandards/PHP_CodeSniffer/wiki"
},
"funding": [
{
"url": "https://github.com/PHPCSStandards",
"type": "github"
},
{
"url": "https://github.com/jrfnl",
"type": "github"
},
{
"url": "https://opencollective.com/php_codesniffer",
"type": "open_collective"
},
{
"url": "https://thanks.dev/u/gh/phpcsstandards",
"type": "thanks_dev"
}
],
"time": "2025-09-15T11:28:58+00:00"
},
{
"name": "staabm/side-effects-detector",
"version": "1.0.5",