Compare commits

...

6 Commits

Author SHA1 Message Date
4e868ca8e9 Sonstiges 2025-10-03 20:22:17 +02:00
a6e2a7733e Fix Docker container startup and API endpoint configuration
- Update Dockerfile to use inline CMD instead of external start.sh script to resolve execution issues with CRLF line endings
- Fix nginx fastcgi_pass configuration to use localhost:9000 for PHP-FPM communication
- Correct API endpoint paths in frontend from /src/api.php to /api.php to match nginx document root configuration
- Ensure Composer dependencies are properly installed with PHP 8.3 compatibility

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-03 20:20:20 +02:00
67390a76f3 Merge conflict resolved in .gitignore 2025-10-03 19:57:58 +02:00
d9d73eee41 Start 2025-09-26 21:30:27 +02:00
5f6f179518 Start 2025-09-26 21:24:36 +02:00
b3e8f2ce85 Start 2025-09-26 21:24:25 +02:00
7 changed files with 575 additions and 7 deletions

2
.gitignore vendored
View File

@@ -22,4 +22,4 @@ Thumbs.db
*.cache
# Docker
docker-compose.override.yml
docker-compose.override.yml

19
AGENTS.md Normal file
View File

@@ -0,0 +1,19 @@
# Repository Guidelines
## Project Structure & Module Organization
The codebase is intentionally lean. `index.php` bootstraps the crawl by instantiating the `WebAnalyse` class and handing off the crawl identifier. Core crawling logic lives in `webanalyse.php`, which houses HTTP fetching, link extraction, and database persistence. Use `setnew.php` to reset seed data inside the `screaming_frog` schema before a rerun. Keep new helpers in their own PHP files under this root so the autoload includes stay predictable; group SQL migrations or fixtures under a `database/` folder if you add them. IDE settings reside in `.idea/`.
## Build, Test, and Development Commands
Run the project through Apache in XAMPP or start the PHP built-in server with `php -S localhost:8080 index.php` from this directory. Validate syntax quickly via `php -l webanalyse.php` (repeat for any new file). When iterating on crawl logic, truncate runtime tables with `php setnew.php` to restore the baseline dataset.
## Coding Style & Naming Conventions
Follow PSR-12 style cues already in use: 4-space indentation, brace-on-new-line for functions, and `declare(strict_types=1);` at the top of entry scripts. Favour descriptive camelCase for methods (`getMultipleWebsites`) and snake_case only for direct SQL field names. Maintain `mysqli` usage for consistency, and gate new configuration through constants or clearly named environment variables.
## Testing Guidelines
There is no automated suite yet; treat each crawl as an integration test. After code changes, run `php setnew.php` followed by a crawl and confirm that `crawl`, `urls`, and `links` tables reflect the expected row counts. Log anomalies with `error_log()` while developing, and remove or downgrade to structured responses before merging.
## Commit & Pull Request Guidelines
Author commit messages in the present tense with a concise summary (`Add link grouping for external URLs`). Group related SQL adjustments with their PHP changes in the same commit. For pull requests, include: a short context paragraph, reproduction steps, screenshots of key output tables when behaviour changes, and any follow-up tasks. Link tracking tickets or issues so downstream agents can trace decisions.
## Security & Configuration Notes
Database credentials are currently hard-coded for local XAMPP usage. If you introduce environment-based configuration, document expected `.env` keys and ensure credentials are excluded from version control. Never commit production connection details or raw crawl exports.

View File

@@ -38,8 +38,5 @@ RUN chown -R www-data:www-data /var/www/html \
# Expose port 80
EXPOSE 80
# Start script
COPY start.sh /start.sh
RUN chmod +x /start.sh
CMD ["/start.sh"]
# Start PHP-FPM and Nginx
CMD php-fpm -D && nginx -g 'daemon off;'

View File

@@ -14,7 +14,7 @@ server {
location ~ \.php$ {
try_files $uri =404;
fastcgi_split_path_info ^(.+\.php)(/.+)$;
fastcgi_pass 127.0.0.1:9000;
fastcgi_pass localhost:9000;
fastcgi_index index.php;
include fastcgi_params;
fastcgi_param SCRIPT_FILENAME $document_root$fastcgi_script_name;

11
index.php Normal file
View File

@@ -0,0 +1,11 @@
<?php

declare(strict_types=1);

// Development entry point: runs one crawl batch for the crawl with ID 1.
// Errors are displayed verbosely because this script is meant for local use.
error_reporting(E_ALL);
ini_set('display_errors', '1');

// Resolve the include relative to this file so the script also works when it
// is started from a different working directory.
require_once __DIR__ . '/webanalyse.php';

// WebAnalyse opens its own database connection when none is injected, so no
// additional mysqli connection is created here.
$wa = new WebAnalyse();
$wa->doCrawl(1);

11
setnew.php Normal file
View File

@@ -0,0 +1,11 @@
<?php

declare(strict_types=1);

// Resets the crawl tables to a known baseline: one crawl row with the seed
// URL, one matching row in `urls` (id 1), and an empty `links` table.
$db = mysqli_connect("localhost", "root", "", "screaming_frog");
if (!$db instanceof mysqli) {
    // Without this guard a failed connection would surface as a fatal
    // "call to a member function query() on bool" further down.
    throw new RuntimeException('Verbindung zur Datenbank konnte nicht hergestellt werden: ' . mysqli_connect_error());
}

// Alternative seed (whole site instead of a single page):
// "insert into crawl (start_url, user_id) values ('https://kies-media.de/', 1)"
$statements = [
    "truncate table crawl",
    "insert into crawl (start_url, user_id) values ('https://kies-media.de/leistungen/externer-ausbilder-fuer-fachinformatiker/', 1)",
    "truncate table urls",
    "insert ignore into urls (id, url, crawl_id) select 1,start_url, id from crawl where id = 1",
    "truncate table links",
];

// Statements run in the listed order; stop at the first failure so a
// half-reset database is noticed instead of silently ignored.
foreach ($statements as $sql) {
    if ($db->query($sql) === false) {
        throw new RuntimeException('Reset-Statement fehlgeschlagen (' . $sql . '): ' . $db->error);
    }
}

530
webanalyse.php Normal file
View File

@@ -0,0 +1,530 @@
<?php
declare(strict_types=1);
/**
 * Coordinates website crawls and persists the fetched response data in the
 * `screaming_frog` database (tables `urls` and `links`).
 */
class WebAnalyse
{
    private const USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36';
    private const CURL_TIMEOUT = 30;
    private const CURL_CONNECT_TIMEOUT = 10;
    private const CURL_MAX_REDIRECTS = 10;

    /** Matches a leading URI scheme such as "https:", "mailto:" or "tel:". */
    private const URI_SCHEME_PATTERN = '#^[a-z][a-z0-9+.\-]*:#i';

    /**
     * @var mysqli Connection to the screaming_frog database.
     */
    private mysqli $db;

    /**
     * @param mysqli|null $connection Existing connection to reuse; when null a
     *                                local default connection is opened.
     * @throws RuntimeException When no database connection could be established.
     */
    public function __construct(?mysqli $connection = null)
    {
        $connection ??= mysqli_connect('localhost', 'root', '', 'screaming_frog');
        if (!$connection instanceof mysqli) {
            throw new RuntimeException('Verbindung zur Datenbank konnte nicht hergestellt werden: ' . mysqli_connect_error());
        }
        $connection->set_charset('utf8mb4');
        $this->db = $connection;
    }

    /**
     * Fetches a single URL and returns the response metadata.
     *
     * @param string $url Address to fetch.
     * @return array<string,mixed> Response payload, or an array with an "error" key.
     */
    public function getWebsite(string $url): array
    {
        $handle = $this->createCurlHandle($url);
        $response = curl_exec($handle);
        if ($response === false) {
            $error = curl_error($handle);
            curl_close($handle);
            return ['error' => $error];
        }
        $info = curl_getinfo($handle);
        curl_close($handle);
        return $this->buildResponsePayload((string)$response, $info);
    }

    /**
     * Fetches several URLs in parallel via curl_multi.
     *
     * @param array<int,string> $urls List of URLs to fetch.
     * @return array<string,array<string,mixed>> Response payload (or "error" entry) per URL.
     */
    public function getMultipleWebsites(array $urls): array
    {
        // Handles are keyed by URL, so a duplicate would overwrite the slot of
        // an earlier handle and leave that handle attached and unclosed.
        $urls = array_values(array_unique(array_map('strval', $urls)));
        if ($urls === []) {
            return [];
        }

        $results = [];
        $handles = [];
        $multiHandle = curl_multi_init();

        try {
            foreach ($urls as $url) {
                $handle = $this->createCurlHandle($url);
                $handles[$url] = $handle;
                curl_multi_add_handle($multiHandle, $handle);
            }

            $running = 0;
            while (true) {
                do {
                    $status = curl_multi_exec($multiHandle, $running);
                } while ($status === CURLM_CALL_MULTI_PERFORM);

                if (!$running || $status !== CURLM_OK) {
                    break;
                }
                // select() may return -1 without waiting; back off briefly to
                // avoid a busy loop in that case.
                if (curl_multi_select($multiHandle, 1.0) === -1) {
                    usleep(100000);
                }
            }

            // The per-transfer result codes are only available through
            // curl_multi_info_read(); curl_multi_getcontent() never returns
            // false, so it cannot be used to detect failed transfers.
            $resultCodes = [];
            while (($message = curl_multi_info_read($multiHandle)) !== false) {
                $finishedUrl = array_search($message['handle'], $handles, true);
                if ($finishedUrl !== false) {
                    $resultCodes[$finishedUrl] = (int)$message['result'];
                }
            }

            foreach ($handles as $url => $handle) {
                $resultCode = $resultCodes[$url] ?? null;
                $response = curl_multi_getcontent($handle);

                if ($resultCode === CURLE_OK && is_string($response)) {
                    $results[$url] = $this->buildResponsePayload($response, curl_getinfo($handle));
                    continue;
                }

                $error = curl_error($handle);
                if ($error === '') {
                    $error = $resultCode === null
                        ? 'Transfer wurde nicht abgeschlossen'
                        : (string)curl_strerror($resultCode);
                }
                $results[$url] = ['error' => $error];
            }
        } finally {
            // Always detach and close, even if handle creation threw midway.
            foreach ($handles as $handle) {
                curl_multi_remove_handle($multiHandle, $handle);
                curl_close($handle);
            }
            curl_multi_close($multiHandle);
        }

        return $results;
    }

    /**
     * Persists the response data of one URL and triggers link analysis.
     *
     * A failed fetch is logged and recorded with status code 0 so the URL gets
     * a `date` and is not selected again by every following doCrawl() batch.
     *
     * @param int $crawlID Identifier of the crawl session.
     * @param string $url URL whose response is being processed.
     * @param array<string,mixed> $data Result of the HTTP request.
     * @throws RuntimeException When the update statement cannot be prepared.
     */
    public function processResults(int $crawlID, string $url, array $data): void
    {
        if (isset($data['error'])) {
            error_log(sprintf('Fehler bei der Analyse von %s: %s', $url, $data['error']));
            $this->storeFetchResult($crawlID, $url, 0, 0, 0, '');
            return;
        }

        $body = (string)($data['body'] ?? '');
        $statusCode = (int)($data['status_code'] ?? 0);
        // curl reports seconds as float; the table stores milliseconds.
        $responseTimeMs = (int)round(((float)($data['response_time'] ?? 0)) * 1000);
        $bodySize = (int)($data['body_size'] ?? strlen($body));

        $this->storeFetchResult($crawlID, $url, $statusCode, $responseTimeMs, $bodySize, $body);
        $this->findNewUrls($crawlID, $body, $url);
    }

    /**
     * Extracts links from a response body, registers unseen target URLs for
     * the crawl and stores the link relations of the page.
     *
     * Only http/https targets are queued; fragments are dropped so that
     * "page#a" and "page#b" map to the same URL row.
     *
     * @param int $crawlID Identifier of the crawl session.
     * @param string $body HTML body of the response.
     * @param string $url Processed URL; used as base for relative links.
     * @throws RuntimeException When the insert/select statements cannot be prepared.
     */
    public function findNewUrls(int $crawlID, string $body, string $url): void
    {
        if ($body === '') {
            return;
        }
        $links = $this->extractLinks($body, $url);
        if ($links === []) {
            return;
        }
        $originId = $this->resolveUrlId($crawlID, $url);
        if ($originId === null) {
            return;
        }

        // Replace the previously stored outgoing links of this page.
        $deleteLinksStmt = $this->db->prepare('DELETE FROM links WHERE von = ?');
        if ($deleteLinksStmt !== false) {
            $deleteLinksStmt->bind_param('i', $originId);
            $deleteLinksStmt->execute();
            $deleteLinksStmt->close();
        }

        $insertUrlStmt = $this->db->prepare('INSERT IGNORE INTO urls (url, crawl_id) VALUES (?, ?)');
        $selectUrlStmt = $this->db->prepare('SELECT id FROM urls WHERE url = ? AND crawl_id = ? LIMIT 1');
        $insertLinkStmt = $this->db->prepare('INSERT IGNORE INTO links (von, nach, linktext, dofollow) VALUES (?, ?, ?, ?)');
        if (!$insertUrlStmt || !$selectUrlStmt || !$insertLinkStmt) {
            throw new RuntimeException('Vorbereitete Statements konnten nicht erstellt werden: ' . $this->db->error);
        }

        foreach ($links as $link) {
            $targetUrl = $this->normaliseCrawlTarget((string)$link['absolute_url']);
            if ($targetUrl === null) {
                // mailto:, tel:, javascript: etc. cannot be crawled.
                continue;
            }

            $insertUrlStmt->bind_param('si', $targetUrl, $crawlID);
            $insertUrlStmt->execute();
            $targetId = (int)$this->db->insert_id;

            if ($targetId === 0) {
                // INSERT IGNORE skipped an existing row; look its id up instead.
                $selectUrlStmt->bind_param('si', $targetUrl, $crawlID);
                $selectUrlStmt->execute();
                $result = $selectUrlStmt->get_result();
                $targetId = $result ? (int)($result->fetch_assoc()['id'] ?? 0) : 0;
                if ($result) {
                    $result->free();
                }
            }
            if ($targetId === 0) {
                continue;
            }

            $linkText = $this->normaliseText((string)($link['text'] ?? ''));
            // rel is a whitespace-separated, case-insensitive token list.
            $relTokens = preg_split('/\s+/', strtolower((string)($link['rel'] ?? '')), -1, PREG_SPLIT_NO_EMPTY) ?: [];
            $isFollow = in_array('nofollow', $relTokens, true) ? 0 : 1;

            $insertLinkStmt->bind_param('iisi', $originId, $targetId, $linkText, $isFollow);
            $insertLinkStmt->execute();
        }

        $insertUrlStmt->close();
        $selectUrlStmt->close();
        $insertLinkStmt->close();
    }

    /**
     * Runs one crawl batch: fetches up to 50 URLs of the crawl that have no
     * `date` yet and processes their responses.
     *
     * @param int $crawlID Identifier of the crawl session.
     */
    public function doCrawl(int $crawlID): void
    {
        $statement = $this->db->prepare(
            'SELECT url FROM urls WHERE crawl_id = ? AND date IS NULL LIMIT 50'
        );
        if ($statement === false) {
            error_log('Crawl-Statement konnte nicht vorbereitet werden: ' . $this->db->error);
            return;
        }
        $statement->bind_param('i', $crawlID);
        $statement->execute();
        $result = $statement->get_result();
        if (!$result instanceof mysqli_result) {
            $statement->close();
            return;
        }

        $urls = [];
        while ($row = $result->fetch_assoc()) {
            $urls[] = (string)$row['url'];
        }
        $result->free();
        $statement->close();

        if ($urls === []) {
            return;
        }
        foreach ($this->getMultipleWebsites($urls) as $url => $data) {
            // Array keys that look numeric come back as int; processResults()
            // requires a string under strict_types.
            $this->processResults($crawlID, (string)$url, $data);
        }
    }

    /**
     * Parses HTML and returns a structured list of the anchors it contains.
     *
     * @param string $html Raw HTML document.
     * @param string $baseUrl Base URL used to resolve relative references.
     * @return array<int,array<string,mixed>> Collected link data.
     */
    public function extractLinks(string $html, string $baseUrl = ''): array
    {
        $links = [];
        $dom = new DOMDocument();

        // Real-world HTML is rarely valid; collect libxml errors silently and
        // restore the caller's error mode afterwards.
        $previous = libxml_use_internal_errors(true);
        // The XML prolog forces libxml to treat the markup as UTF-8.
        $dom->loadHTML('<?xml encoding="UTF-8">' . $html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        foreach ($dom->getElementsByTagName('a') as $index => $aTag) {
            $href = trim($aTag->getAttribute('href'));
            if ($href === '') {
                continue;
            }
            $absoluteUrl = $this->resolveUrl($href, $baseUrl);
            $rel = $aTag->getAttribute('rel');
            $title = $aTag->getAttribute('title');
            $target = $aTag->getAttribute('target');

            $links[] = [
                'index' => $index + 1,
                'href' => $href,
                'absolute_url' => $absoluteUrl,
                'text' => $this->normaliseText(trim($aTag->textContent)),
                'rel' => $rel !== '' ? $rel : null,
                'title' => $title !== '' ? $title : null,
                'target' => $target !== '' ? $target : null,
                'is_external' => $this->isExternalLink($absoluteUrl, $baseUrl),
                'link_type' => $this->getLinkType($href),
                'is_internal' => $this->isInternalLink($absoluteUrl, $baseUrl) ? 1 : 0,
            ];
        }
        return $links;
    }

    /**
     * Checks whether a link points to a different host than the base URL.
     *
     * Host names are compared case-insensitively.
     *
     * @param string $href Link target.
     * @param string $baseUrl URL the link was found on.
     * @return bool|null True for external, false for internal, null when either host is unknown.
     */
    private function isExternalLink(string $href, string $baseUrl): ?bool
    {
        if ($baseUrl === '') {
            return null;
        }
        $baseHost = parse_url($baseUrl, PHP_URL_HOST);
        $linkHost = parse_url($href, PHP_URL_HOST);
        // parse_url() yields null for a missing host and false for malformed input.
        if (!is_string($baseHost) || !is_string($linkHost)) {
            return null;
        }
        return strcasecmp($baseHost, $linkHost) !== 0;
    }

    /**
     * Checks whether a link points to the same host as the base URL.
     *
     * @param string $href Link target.
     * @param string $baseUrl URL the link was found on.
     * @return bool|null True for internal, false for external, null when either host is unknown.
     */
    private function isInternalLink(string $href, string $baseUrl): ?bool
    {
        $external = $this->isExternalLink($href, $baseUrl);
        return $external === null ? null : !$external;
    }

    /**
     * Derives a link type from common schemes and patterns.
     *
     * @param string $href Link target as written in the document.
     * @return string One of "empty", "email", "phone", "anchor", "javascript", "absolute", "relative".
     */
    private function getLinkType(string $href): string
    {
        if ($href === '') {
            return 'empty';
        }

        $lower = strtolower($href);
        $prefixTypes = [
            'mailto:' => 'email',
            'tel:' => 'phone',
            '#' => 'anchor',
            'javascript:' => 'javascript',
        ];
        foreach ($prefixTypes as $prefix => $type) {
            if (strncmp($lower, $prefix, strlen($prefix)) === 0) {
                return $type;
            }
        }

        // Anything carrying its own scheme or authority ("//host/...") is
        // absolute, including URLs with non-ASCII characters that
        // FILTER_VALIDATE_URL would reject.
        if (strncmp($href, '//', 2) === 0 || preg_match(self::URI_SCHEME_PATTERN, $href) === 1) {
            return 'absolute';
        }
        return 'relative';
    }

    /**
     * Groups links by their previously determined "link_type".
     *
     * @param array<int,array<string,mixed>> $links Extracted links.
     * @return array<string,array<int,array<string,mixed>>> Links grouped by type; links without a type land in "unknown".
     */
    public function groupLinksByType(array $links): array
    {
        $grouped = [];
        foreach ($links as $link) {
            $type = (string)($link['link_type'] ?? 'unknown');
            $grouped[$type][] = $link;
        }
        return $grouped;
    }

    /**
     * Creates a configured curl handle for one request.
     *
     * @return CurlHandle
     * @throws RuntimeException When curl cannot initialise a handle.
     */
    private function createCurlHandle(string $url)
    {
        $handle = curl_init($url);
        if ($handle === false) {
            throw new RuntimeException('Konnte Curl-Handle nicht initialisieren: ' . $url);
        }
        curl_setopt_array($handle, [
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_HEADER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_MAXREDIRS => self::CURL_MAX_REDIRECTS,
            CURLOPT_CONNECTTIMEOUT => self::CURL_CONNECT_TIMEOUT,
            CURLOPT_TIMEOUT => self::CURL_TIMEOUT,
            CURLOPT_USERAGENT => self::USER_AGENT,
            // URLs come from crawled pages; never let them (or a redirect)
            // reach file://, gopher:// and similar schemes.
            CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
            CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
            // NOTE(review): certificate verification is disabled, presumably
            // for local setups without a CA bundle — confirm before using
            // this against production data.
            CURLOPT_SSL_VERIFYPEER => false,
        ]);
        return $handle;
    }

    /**
     * Splits a raw response into headers and body and builds the result array.
     *
     * @param string $response Complete response including headers.
     * @param array<string,mixed> $info Result of curl_getinfo().
     * @return array<string,mixed> Keys: url, status_code, headers_parsed, body, response_time, body_size.
     */
    private function buildResponsePayload(string $response, array $info): array
    {
        $headerSize = (int)($info['header_size'] ?? 0);
        $rawHeaders = (string)substr($response, 0, $headerSize);
        $body = (string)substr($response, $headerSize);

        // With redirects followed, the header section holds one block per hop;
        // only the last block belongs to the response whose body we keep.
        $headerBlocks = preg_split('/\r?\n\r?\n/', trim($rawHeaders)) ?: [$rawHeaders];
        $finalHeaders = (string)end($headerBlocks);

        return [
            'url' => $info['url'] ?? ($info['redirect_url'] ?? ''),
            'status_code' => (int)($info['http_code'] ?? 0),
            'headers_parsed' => $this->parseHeaders($finalHeaders),
            'body' => $body,
            'response_time' => (float)($info['total_time'] ?? 0.0),
            'body_size' => strlen($body),
        ];
    }

    /**
     * Converts a raw header block into an associative array.
     *
     * Lines without a colon (e.g. the status line) are skipped; a repeated
     * header name keeps its last value.
     *
     * @param string $headers Raw header block.
     * @return array<string,string>
     */
    private function parseHeaders(string $headers): array
    {
        $parsed = [];
        $lines = preg_split('/\r?\n/', trim($headers)) ?: [];
        foreach ($lines as $line) {
            if (strpos($line, ':') === false) {
                continue;
            }
            [$key, $value] = explode(':', $line, 2);
            $parsed[trim($key)] = trim($value);
        }
        return $parsed;
    }

    /**
     * Resolves a link reference against a base URL (following RFC 3986 §5.2).
     *
     * References that already carry a scheme are returned unchanged. User
     * info of the base URL is not carried over.
     *
     * @param string $href Reference as found in the document.
     * @param string $baseUrl Absolute URL of the document.
     * @return string Absolute URL, or $href unchanged when it cannot be resolved.
     */
    private function resolveUrl(string $href, string $baseUrl): string
    {
        if ($href === '' || $baseUrl === '') {
            return $href;
        }
        if (preg_match(self::URI_SCHEME_PATTERN, $href) === 1) {
            return $href;
        }

        $baseParts = parse_url($baseUrl);
        if ($baseParts === false || !isset($baseParts['scheme'], $baseParts['host'])) {
            return $href;
        }

        $scheme = $baseParts['scheme'];
        // Network-path reference: inherits only the scheme.
        if (strncmp($href, '//', 2) === 0) {
            return $scheme . ':' . $href;
        }

        $port = isset($baseParts['port']) ? ':' . $baseParts['port'] : '';
        $origin = $scheme . '://' . $baseParts['host'] . $port;
        $basePath = $baseParts['path'] ?? '/';
        if ($basePath === '') {
            $basePath = '/';
        }

        // Fragment-only reference: same document, same query.
        if ($href[0] === '#') {
            $baseQuery = isset($baseParts['query']) ? '?' . $baseParts['query'] : '';
            return $origin . $basePath . $baseQuery . $href;
        }
        // Query-only reference: same path, new query.
        if ($href[0] === '?') {
            return $origin . $basePath . $href;
        }

        // Separate the path from a trailing query/fragment so dot-segment
        // removal only touches the path.
        $pathLength = strcspn($href, '?#');
        $referencePath = (string)substr($href, 0, $pathLength);
        $suffix = (string)substr($href, $pathLength);

        if ($referencePath[0] === '/') {
            $path = $referencePath;
        } else {
            $lastSlash = strrpos($basePath, '/');
            $directory = $lastSlash === false ? '/' : (string)substr($basePath, 0, $lastSlash + 1);
            $path = $directory . $referencePath;
        }

        return $origin . $this->removeDotSegments($path) . $suffix;
    }

    /**
     * Removes "." and ".." segments from an absolute path.
     *
     * @param string $path Path beginning with "/".
     * @return string Normalised path that always begins with a single "/".
     */
    private function removeDotSegments(string $path): string
    {
        $segments = explode('/', $path);
        $lastIndex = count($segments) - 1;
        $output = [];

        foreach ($segments as $position => $segment) {
            if ($segment === '.' || $segment === '..') {
                // Never pop the leading empty segment that represents the root.
                if ($segment === '..' && count($output) > 1) {
                    array_pop($output);
                }
                // A trailing dot segment leaves a directory ("/a/b/.." => "/a/").
                if ($position === $lastIndex) {
                    $output[] = '';
                }
                continue;
            }
            $output[] = $segment;
        }

        return '/' . ltrim(implode('/', $output), '/');
    }

    /**
     * Reduces an absolute URL to the form that is stored for crawling.
     *
     * @param string $absoluteUrl Resolved link target.
     * @return string|null URL without fragment, or null when it is not an http/https URL with a host.
     */
    private function normaliseCrawlTarget(string $absoluteUrl): ?string
    {
        $withoutFragment = explode('#', $absoluteUrl, 2)[0];
        $scheme = parse_url($withoutFragment, PHP_URL_SCHEME);
        if (!is_string($scheme) || !in_array(strtolower($scheme), ['http', 'https'], true)) {
            return null;
        }
        if (!is_string(parse_url($withoutFragment, PHP_URL_HOST))) {
            return null;
        }
        return $withoutFragment;
    }

    /**
     * Returns the text as UTF-8 with whitespace runs collapsed to one space.
     *
     * The encoding is converted first: the whitespace regex runs in UTF-8
     * mode and would fail (yielding an empty result) on non-UTF-8 input.
     */
    private function normaliseText(string $text): string
    {
        $encoding = mb_detect_encoding($text, ['UTF-8', 'ISO-8859-1', 'Windows-1252'], true) ?: 'UTF-8';
        $utf8 = (string)mb_convert_encoding($text, 'UTF-8', $encoding);
        $collapsed = preg_replace('/\s+/u', ' ', $utf8) ?? $utf8;
        return trim($collapsed);
    }

    /**
     * Looks up the id of a URL within one crawl.
     *
     * @return int|null Row id, or null when the URL is unknown or the lookup failed.
     */
    private function resolveUrlId(int $crawlID, string $url): ?int
    {
        $statement = $this->db->prepare('SELECT id FROM urls WHERE url = ? AND crawl_id = ? LIMIT 1');
        if ($statement === false) {
            return null;
        }
        $statement->bind_param('si', $url, $crawlID);
        $statement->execute();
        $result = $statement->get_result();
        $id = $result ? ($result->fetch_assoc()['id'] ?? null) : null;
        if ($result) {
            $result->free();
        }
        $statement->close();
        return $id !== null ? (int)$id : null;
    }

    /**
     * Writes the fetch outcome of one URL to the `urls` table and stamps its `date`.
     *
     * @throws RuntimeException When the update statement cannot be prepared.
     */
    private function storeFetchResult(
        int $crawlID,
        string $url,
        int $statusCode,
        int $responseTimeMs,
        int $bodySize,
        string $body
    ): void {
        $update = $this->db->prepare(
            'UPDATE urls
            SET status_code = ?, response_time = ?, body_size = ?, date = NOW(), body = ?
            WHERE url = ? AND crawl_id = ?
            LIMIT 1'
        );
        if ($update === false) {
            throw new RuntimeException('Update-Statement konnte nicht vorbereitet werden: ' . $this->db->error);
        }
        $update->bind_param('iiissi', $statusCode, $responseTimeMs, $bodySize, $body, $url, $crawlID);
        if (!$update->execute()) {
            error_log(sprintf('URL %s konnte nicht aktualisiert werden: %s', $url, $update->error));
        }
        $update->close();
    }
}