From e569d189d52a292e61dd5cdaeaea43d6c5454b40 Mon Sep 17 00:00:00 2001 From: Martin Date: Fri, 3 Oct 2025 23:58:21 +0200 Subject: [PATCH] Add comprehensive quality tooling and fix code style issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Quality Tools Added: - PHPStan (Level 8) for static analysis - PHP_CodeSniffer (PSR-12) for code style - Updated PHPUnit test suite with type safety Code Improvements: - Fixed all PHPStan Level 8 errors (13 issues) - Auto-fixed 25 PSR-12 code style violations - Added proper type hints for arrays and method parameters - Fixed PDOStatement|false handling in api.php and tests - Improved null-safety for parse_url() calls Configuration: - phpstan.neon: Level 8, analyzes src/ and tests/ - phpcs.xml: PSR-12 standard, excludes vendor/ - docker-compose.yml: Mount config files for tooling - composer.json: Add phpstan, phpcs, phpcbf scripts Documentation: - Updated README.md with testing and quality sections - Updated AGENTS.md with quality gates and workflows - Added pre-commit checklist for developers All tests pass (9/9), PHPStan clean (0 errors), PHPCS compliant (1 warning) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- AGENTS.md | 100 +++++++++++++- README.md | 54 ++++++++ docker-compose.yml | 2 + phpcs.xml | 19 +++ phpstan.neon | 7 + src/api.php | 3 + src/classes/Crawler.php | 60 ++++++--- src/classes/Database.php | 10 +- src/composer.json | 9 +- src/composer.lock | 134 ++++++++++++++++++- tests/Integration/CrawlerIntegrationTest.php | 3 +- tests/Unit/CrawlerTest.php | 3 +- tests/Unit/DatabaseTest.php | 1 + 13 files changed, 378 insertions(+), 27 deletions(-) create mode 100644 phpcs.xml create mode 100644 phpstan.neon diff --git a/AGENTS.md b/AGENTS.md index 8b3c3af..9cb8be1 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -4,13 +4,109 @@ The codebase is intentionally lean. 
`index.php` bootstraps the crawl by instantiating `webanalyse` and handing off the crawl identifier. Core crawling logic lives in `webanalyse.php`, which houses HTTP fetching, link extraction, and database persistence. Use `setnew.php` to reset seed data inside the `screaming_frog` schema before a rerun. Keep new helpers in their own PHP files under this root so the autoload includes stay predictable; group SQL migrations or fixtures under a `database/` folder if you add them. IDE settings reside in `.idea/`. ## Build, Test, and Development Commands -Run the project through Apache in XAMPP or start the PHP built-in server with `php -S localhost:8080 index.php` from this directory. Validate syntax quickly via `php -l webanalyse.php` (repeat for any new file). When iterating on crawl logic, truncate runtime tables with `php setnew.php` to restore the baseline dataset. + +### Docker Development +The project runs in Docker containers. Use these commands: + +```bash +# Start containers +docker-compose up -d + +# Stop containers +docker-compose down + +# Rebuild containers +docker-compose up -d --build + +# View logs +docker-compose logs -f php +``` + +### Running Tests +The project uses PHPUnit for automated testing: + +```bash +# Run all tests (Unit + Integration) +docker-compose exec php sh -c "php /var/www/html/vendor/bin/phpunit /var/www/tests/" + +# Or use the composer shortcut +docker-compose exec php composer test +``` + +**Test Structure:** +- `tests/Unit/` - Unit tests for individual components +- `tests/Integration/` - Integration tests for full crawl workflows +- All tests run in isolated database transactions + +### Static Code Analysis +PHPStan is configured at Level 8 to ensure type safety: + +```bash +# Run PHPStan analysis +docker-compose exec php sh -c "php -d memory_limit=512M /var/www/html/vendor/bin/phpstan analyse -c /var/www/phpstan.neon" + +# Or use the composer shortcut +docker-compose exec php composer phpstan +``` + +**PHPStan 
Configuration:** +- Level: 8 (strict; PHPStan 2.x levels 9 and 10 are stricter still) +- Analyzes: `src/` and `tests/` +- Excludes: `vendor/` +- Config file: `phpstan.neon` + +All code must pass PHPStan Level 8 with zero errors before merging. + +### Code Style Checking +PHP_CodeSniffer enforces PSR-12 coding standards: + +```bash +# Check code style +docker-compose exec php composer phpcs + +# Automatically fix code style issues +docker-compose exec php composer phpcbf +``` + +**PHPCS Configuration:** +- Standard: PSR-12 +- Analyzes: `src/` and `tests/` +- Excludes: `vendor/` +- Auto-fix available via `phpcbf` + +Run `phpcbf` before committing to automatically fix most style violations. ## Coding Style & Naming Conventions Follow PSR-12 style cues already in use: 4-space indentation, brace-on-new-line for functions, and `declare(strict_types=1);` at the top of entry scripts. Favour descriptive camelCase for methods (`getMultipleWebsites`) and snake_case only for direct SQL field names. Maintain `mysqli` usage for consistency, and gate new configuration through constants or clearly named environment variables. ## Testing Guidelines -There is no automated suite yet; treat each crawl as an integration test. After code changes, run `php setnew.php` followed by a crawl and confirm that `crawl`, `urls`, and `links` tables reflect the expected row counts. Log anomalies with `error_log()` while developing, and remove or downgrade to structured responses before merging. + +### Automated Testing +The project has a comprehensive test suite using PHPUnit: + +- **Write tests first**: Follow TDD principles when adding new features +- **Unit tests** (`tests/Unit/`): Test individual classes and methods in isolation +- **Integration tests** (`tests/Integration/`): Test full crawl workflows with real HTTP requests +- **Database isolation**: Tests use transactions that roll back automatically +- **Coverage**: Aim for high test coverage on critical crawl logic + +### Quality Gates +Before committing code, ensure: +1. 
All tests pass: `docker-compose exec php composer test` +2. PHPStan analysis passes: `docker-compose exec php composer phpstan` +3. Code style is correct: `docker-compose exec php composer phpcs` +4. Auto-fix style issues: `docker-compose exec php composer phpcbf` + +**Pre-commit Checklist:** +- ✅ Tests pass +- ✅ PHPStan Level 8 with 0 errors +- ✅ PHPCS PSR-12 compliance (warnings acceptable) + +### Manual Testing +For UI changes, manually test the crawler interface at http://localhost:8080. Verify: +- Job creation and status updates +- Page and link extraction accuracy +- Error handling for invalid URLs or network issues ## Commit & Pull Request Guidelines Author commit messages in the present tense with a concise summary (`Add link grouping for external URLs`). Group related SQL adjustments with their PHP changes in the same commit. For pull requests, include: a short context paragraph, reproduction steps, screenshots of key output tables when behaviour changes, and any follow-up tasks. Link tracking tickets or issues so downstream agents can trace decisions. diff --git a/README.md b/README.md index 4516d23..d903925 100644 --- a/README.md +++ b/README.md @@ -56,3 +56,57 @@ docker-compose up -d --build ## Entwicklung Die Anwendungsdateien befinden sich im `src/` Verzeichnis und werden als Volume in den Container gemountet, sodass Änderungen sofort sichtbar sind. 
+ +## Tests & Code-Qualität + +### Unit Tests ausführen + +Die Anwendung verwendet PHPUnit für Unit- und Integrationstests: + +```bash +# Alle Tests ausführen +docker-compose exec php sh -c "php /var/www/html/vendor/bin/phpunit /var/www/tests/" + +# Alternative mit Composer-Script +docker-compose exec php composer test +``` + +Die Tests befinden sich in: +- `tests/Unit/` - Unit Tests +- `tests/Integration/` - Integration Tests + +### Statische Code-Analyse mit PHPStan + +PHPStan ist auf Level 8 konfiguriert und analysiert den gesamten Code: + +```bash +# PHPStan ausführen +docker-compose exec php sh -c "php -d memory_limit=512M /var/www/html/vendor/bin/phpstan analyse -c /var/www/phpstan.neon" + +# Alternative mit Composer-Script +docker-compose exec php composer phpstan +``` + +**PHPStan Konfiguration:** +- Level: 8 +- Analysierte Pfade: `src/` und `tests/` +- Ausgeschlossen: `vendor/` Ordner +- Konfigurationsdatei: `phpstan.neon` + +### Code Style Prüfung mit PHP_CodeSniffer + +PHP_CodeSniffer (PHPCS) prüft den Code gegen PSR-12 Standards: + +```bash +# Code Style prüfen +docker-compose exec php composer phpcs + +# Code Style automatisch korrigieren +docker-compose exec php composer phpcbf +``` + +**PHPCS Konfiguration:** +- Standard: PSR-12 +- Analysierte Pfade: `src/` und `tests/` +- Ausgeschlossen: `vendor/` Ordner +- Auto-Fix verfügbar mit `phpcbf` diff --git a/docker-compose.yml b/docker-compose.yml index 6694709..6f89cad 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -13,6 +13,8 @@ services: - ./tests:/var/www/tests - ./composer.json:/var/www/composer.json - ./composer.lock:/var/www/composer.lock + - ./phpstan.neon:/var/www/phpstan.neon + - ./phpcs.xml:/var/www/phpcs.xml - ./config/nginx/default.conf:/etc/nginx/conf.d/default.conf depends_on: - mariadb diff --git a/phpcs.xml b/phpcs.xml new file mode 100644 index 0000000..923032e --- /dev/null +++ b/phpcs.xml @@ -0,0 +1,19 @@ + + + PHP_CodeSniffer configuration + + 
+ + + + /var/www/html + /var/www/tests + + + /var/www/html/vendor/* + */vendor/* + + + + + diff --git a/phpstan.neon b/phpstan.neon new file mode 100644 index 0000000..9b227e6 --- /dev/null +++ b/phpstan.neon @@ -0,0 +1,7 @@ +parameters: + level: 8 + paths: + - /var/www/html + - /var/www/tests + excludePaths: + - /var/www/html/vendor diff --git a/src/api.php b/src/api.php index 9d6f074..6d25318 100644 --- a/src/api.php +++ b/src/api.php @@ -73,6 +73,9 @@ try { case 'jobs': $stmt = $db->query("SELECT * FROM crawl_jobs ORDER BY created_at DESC LIMIT 50"); + if ($stmt === false) { + throw new Exception('Failed to query jobs'); + } $jobs = $stmt->fetchAll(); echo json_encode([ diff --git a/src/classes/Crawler.php b/src/classes/Crawler.php index 55d30ec..548a19f 100644 --- a/src/classes/Crawler.php +++ b/src/classes/Crawler.php @@ -8,15 +8,18 @@ use GuzzleHttp\Psr7\Request; use GuzzleHttp\Exception\RequestException; use Symfony\Component\DomCrawler\Crawler as DomCrawler; -class Crawler { +class Crawler +{ private \PDO $db; private Client $client; private int $concurrency = 10; // Parallel requests + /** @var array */ private array $visited = []; private int $crawlJobId; private string $baseDomain; - public function __construct(int $crawlJobId) { + public function __construct(int $crawlJobId) + { $this->db = Database::getInstance(); $this->crawlJobId = $crawlJobId; $this->client = new Client([ @@ -28,8 +31,10 @@ class Crawler { ]); } - public function start(string $startUrl): void { - $this->baseDomain = strtolower(parse_url($startUrl, PHP_URL_HOST)); + public function start(string $startUrl): void + { + $host = parse_url($startUrl, PHP_URL_HOST); + $this->baseDomain = strtolower($host ?: ''); // Update job status $stmt = $this->db->prepare("UPDATE crawl_jobs SET status = 'running', started_at = NOW() WHERE id = ?"); @@ -48,7 +53,8 @@ class Crawler { $stmt->execute([$this->crawlJobId]); } - private function addToQueue(string $url, int $depth): void { + private function 
addToQueue(string $url, int $depth): void + { if (isset($this->visited[$url])) { return; } @@ -63,7 +69,8 @@ class Crawler { } } - private function processQueue(): void { + private function processQueue(): void + { while (true) { // Get pending URLs $stmt = $this->db->prepare( @@ -82,14 +89,18 @@ class Crawler { } } - private function crawlBatch(array $urls): void { - $requests = function() use ($urls) { + /** + * @param array $urls + */ + private function crawlBatch(array $urls): void + { + $requests = function () use ($urls) { foreach ($urls as $item) { // Mark as processing $stmt = $this->db->prepare("UPDATE crawl_queue SET status = 'processing' WHERE id = ?"); $stmt->execute([$item['id']]); - yield function() use ($item) { + yield function () use ($item) { return $this->client->getAsync($item['url']); }; } @@ -110,7 +121,12 @@ class Crawler { $pool->promise()->wait(); } - private function handleResponse(array $queueItem, $response): void { + /** + * @param array{id: int, url: string, depth: int} $queueItem + * @param \Psr\Http\Message\ResponseInterface $response + */ + private function handleResponse(array $queueItem, $response): void + { $url = $queueItem['url']; $depth = $queueItem['depth']; @@ -143,7 +159,7 @@ class Crawler { } // Extract and save links - if (str_contains($contentType, 'text/html')) { + if (str_contains($contentType, 'text/html') && is_int($pageId)) { echo "Extracting links from: $url (pageId: $pageId)\n"; $this->extractLinks($domCrawler, $url, $pageId, $depth); } else { @@ -155,7 +171,8 @@ class Crawler { $stmt->execute([$queueItem['id']]); } - private function extractLinks(DomCrawler $crawler, string $sourceUrl, int $pageId, int $depth): void { + private function extractLinks(DomCrawler $crawler, string $sourceUrl, int $pageId, int $depth): void + { $linkCount = 0; $crawler->filter('a')->each(function (DomCrawler $node) use ($sourceUrl, $pageId, $depth, &$linkCount) { try { @@ -176,7 +193,8 @@ class Crawler { $isNofollow = 
str_contains($rel, 'nofollow'); // Check if internal (same domain, no subdomains) - $targetDomain = strtolower(parse_url($targetUrl, PHP_URL_HOST) ?? ''); + $targetHost = parse_url($targetUrl, PHP_URL_HOST); + $targetDomain = strtolower($targetHost ?: ''); $isInternal = ($targetDomain === $this->baseDomain); // Save link @@ -207,7 +225,8 @@ class Crawler { echo "Processed $linkCount links from $sourceUrl\n"; } - private function makeAbsoluteUrl(string $url, string $base): string { + private function makeAbsoluteUrl(string $url, string $base): string + { if (filter_var($url, FILTER_VALIDATE_URL)) { return $url; } @@ -225,14 +244,20 @@ class Crawler { return "$scheme://$host$basePath$url"; } - private function handleError(array $queueItem, $reason): void { + /** + * @param array{id: int, url: string, depth: int} $queueItem + * @param \GuzzleHttp\Exception\RequestException $reason + */ + private function handleError(array $queueItem, $reason): void + { $stmt = $this->db->prepare( "UPDATE crawl_queue SET status = 'failed', processed_at = NOW(), retry_count = retry_count + 1 WHERE id = ?" 
); $stmt->execute([$queueItem['id']]); } - private function updateJobStats(): void { + private function updateJobStats(): void + { $stmt = $this->db->prepare( "UPDATE crawl_jobs SET total_pages = (SELECT COUNT(*) FROM pages WHERE crawl_job_id = ?), @@ -242,7 +267,8 @@ class Crawler { $stmt->execute([$this->crawlJobId, $this->crawlJobId, $this->crawlJobId]); } - private function normalizeUrl(string $url): string { + private function normalizeUrl(string $url): string + { // Parse URL $parts = parse_url($url); diff --git a/src/classes/Database.php b/src/classes/Database.php index 9b95aba..9196992 100644 --- a/src/classes/Database.php +++ b/src/classes/Database.php @@ -5,12 +5,16 @@ namespace App; use PDO; use PDOException; -class Database { +class Database +{ private static ?PDO $instance = null; - private function __construct() {} + private function __construct() + { + } - public static function getInstance(): PDO { + public static function getInstance(): PDO + { if (self::$instance === null) { try { self::$instance = new PDO( diff --git a/src/composer.json b/src/composer.json index 9e241a1..bf9f07a 100644 --- a/src/composer.json +++ b/src/composer.json @@ -9,7 +9,9 @@ "symfony/css-selector": "^7.0" }, "require-dev": { - "phpunit/phpunit": "^11.0" + "phpunit/phpunit": "^11.0", + "phpstan/phpstan": "^2.1", + "squizlabs/php_codesniffer": "^4.0" }, "autoload": { "psr-4": { @@ -22,6 +24,9 @@ } }, "scripts": { - "test": "phpunit" + "test": "phpunit", + "phpstan": "phpstan analyse -c ../phpstan.neon --memory-limit=512M", + "phpcs": "phpcs --standard=PSR12 --ignore=/var/www/html/vendor /var/www/html /var/www/tests", + "phpcbf": "phpcbf --standard=PSR12 --ignore=/var/www/html/vendor /var/www/html /var/www/tests" } } diff --git a/src/composer.lock b/src/composer.lock index b4cb5dd..c345586 100644 --- a/src/composer.lock +++ b/src/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is 
@generated automatically" ], - "content-hash": "96376d6cdbd0e0665e091abe3e0ef8d8", + "content-hash": "bb0d5fc291c18a44bfc693b94b302357", "packages": [ { "name": "guzzlehttp/guzzle", @@ -1211,6 +1211,59 @@ }, "time": "2022-02-21T01:04:05+00:00" }, + { + "name": "phpstan/phpstan", + "version": "2.1.30", + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/phpstan/phpstan/zipball/a4a7f159927983dd4f7c8020ed227d80b7f39d7d", + "reference": "a4a7f159927983dd4f7c8020ed227d80b7f39d7d", + "shasum": "" + }, + "require": { + "php": "^7.4|^8.0" + }, + "conflict": { + "phpstan/phpstan-shim": "*" + }, + "bin": [ + "phpstan", + "phpstan.phar" + ], + "type": "library", + "autoload": { + "files": [ + "bootstrap.php" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "description": "PHPStan - PHP Static Analysis Tool", + "keywords": [ + "dev", + "static analysis" + ], + "support": { + "docs": "https://phpstan.org/user-guide/getting-started", + "forum": "https://github.com/phpstan/phpstan/discussions", + "issues": "https://github.com/phpstan/phpstan/issues", + "security": "https://github.com/phpstan/phpstan/security/policy", + "source": "https://github.com/phpstan/phpstan-src" + }, + "funding": [ + { + "url": "https://github.com/ondrejmirtes", + "type": "github" + }, + { + "url": "https://github.com/phpstan", + "type": "github" + } + ], + "time": "2025-10-02T16:07:52+00:00" + }, { "name": "phpunit/php-code-coverage", "version": "11.0.11", @@ -2641,6 +2694,85 @@ ], "time": "2024-10-09T05:16:32+00:00" }, + { + "name": "squizlabs/php_codesniffer", + "version": "4.0.0", + "source": { + "type": "git", + "url": "https://github.com/PHPCSStandards/PHP_CodeSniffer.git", + "reference": "06113cfdaf117fc2165f9cd040bd0f17fcd5242d" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/PHPCSStandards/PHP_CodeSniffer/zipball/06113cfdaf117fc2165f9cd040bd0f17fcd5242d", + "reference": 
"06113cfdaf117fc2165f9cd040bd0f17fcd5242d", + "shasum": "" + }, + "require": { + "ext-simplexml": "*", + "ext-tokenizer": "*", + "ext-xmlwriter": "*", + "php": ">=7.2.0" + }, + "require-dev": { + "phpunit/phpunit": "^8.4.0 || ^9.3.4 || ^10.5.32 || 11.3.3 - 11.5.28 || ^11.5.31" + }, + "bin": [ + "bin/phpcbf", + "bin/phpcs" + ], + "type": "library", + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Greg Sherwood", + "role": "Former lead" + }, + { + "name": "Juliette Reinders Folmer", + "role": "Current lead" + }, + { + "name": "Contributors", + "homepage": "https://github.com/PHPCSStandards/PHP_CodeSniffer/graphs/contributors" + } + ], + "description": "PHP_CodeSniffer tokenizes PHP files and detects violations of a defined set of coding standards.", + "homepage": "https://github.com/PHPCSStandards/PHP_CodeSniffer", + "keywords": [ + "phpcs", + "standards", + "static analysis" + ], + "support": { + "issues": "https://github.com/PHPCSStandards/PHP_CodeSniffer/issues", + "security": "https://github.com/PHPCSStandards/PHP_CodeSniffer/security/policy", + "source": "https://github.com/PHPCSStandards/PHP_CodeSniffer", + "wiki": "https://github.com/PHPCSStandards/PHP_CodeSniffer/wiki" + }, + "funding": [ + { + "url": "https://github.com/PHPCSStandards", + "type": "github" + }, + { + "url": "https://github.com/jrfnl", + "type": "github" + }, + { + "url": "https://opencollective.com/php_codesniffer", + "type": "open_collective" + }, + { + "url": "https://thanks.dev/u/gh/phpcsstandards", + "type": "thanks_dev" + } + ], + "time": "2025-09-15T11:28:58+00:00" + }, { "name": "staabm/side-effects-detector", "version": "1.0.5", diff --git a/tests/Integration/CrawlerIntegrationTest.php b/tests/Integration/CrawlerIntegrationTest.php index af5d84e..72f33f1 100644 --- a/tests/Integration/CrawlerIntegrationTest.php +++ b/tests/Integration/CrawlerIntegrationTest.php @@ -18,7 +18,8 @@ class CrawlerIntegrationTest 
extends TestCase // Create a test job $stmt = $this->db->prepare("INSERT INTO crawl_jobs (domain, status) VALUES (?, 'pending')"); $stmt->execute(['https://httpbin.org']); - $this->testJobId = $this->db->lastInsertId(); + $lastId = $this->db->lastInsertId(); + $this->testJobId = is_numeric($lastId) ? (int)$lastId : 0; } protected function tearDown(): void diff --git a/tests/Unit/CrawlerTest.php b/tests/Unit/CrawlerTest.php index 3794a20..ae15dce 100644 --- a/tests/Unit/CrawlerTest.php +++ b/tests/Unit/CrawlerTest.php @@ -17,7 +17,8 @@ class CrawlerTest extends TestCase // Create a test job $stmt = $db->prepare("INSERT INTO crawl_jobs (domain, status) VALUES (?, 'pending')"); $stmt->execute(['https://example.com']); - $this->testJobId = $db->lastInsertId(); + $lastId = $db->lastInsertId(); + $this->testJobId = is_numeric($lastId) ? (int)$lastId : 0; } protected function tearDown(): void diff --git a/tests/Unit/DatabaseTest.php b/tests/Unit/DatabaseTest.php index adfcec8..a2d4bc5 100644 --- a/tests/Unit/DatabaseTest.php +++ b/tests/Unit/DatabaseTest.php @@ -42,6 +42,7 @@ class DatabaseTest extends TestCase { $db = Database::getInstance(); $stmt = $db->query('SELECT 1 as test'); + $this->assertNotFalse($stmt, 'Query failed'); $result = $stmt->fetch(); $this->assertEquals(['test' => 1], $result);