Sonstiges

2025-10-03 20:22:17 +02:00
parent a6e2a7733e
commit 4e868ca8e9
2 changed files with 361 additions and 290 deletions
--- a/index.php
+++ b/index.php
@@ -1,13 +1,11 @@
 <?php
 declare(strict_types=1);

-Error_reporting(E_ALL);
-ini_set('display_errors', 1);
+error_reporting(E_ALL);
+ini_set('display_errors', '1');

 require_once 'webanalyse.php';
-$wa = new webanalyse();
-$db = mysqli_connect("localhost", "root", "", "screaming_frog");
-
+$wa = new WebAnalyse();
+$db = mysqli_connect('localhost', 'root', '', 'screaming_frog');

 $wa->doCrawl(1);
-
--- a/webanalyse.php
+++ b/webanalyse.php
@@ -1,88 +1,53 @@
 <?php

+declare(strict_types=1);

 /**
- * Klasse uebernimmt das Crawlen von Websites und persistiert Metadaten in MySQL.
+ * Koordiniert Webseiten-Crawls und persistiert Antwortdaten in der Screaming Frog Datenbank.
 */
-class webanalyse
+class WebAnalyse
 {
-    /**
-     * @var mysqli|null Verbindung zur Screaming Frog Datenbank.
-     */
-    var $db;
+    private const USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36';
+    private const CURL_TIMEOUT = 30;

    /**
-     * Initialisiert die Datenbankverbindung fuer die Crawl-Session.
+     * @var mysqli Verbindung zur Screaming Frog Datenbank.
     */
-    function __construct()
+    private mysqli $db;
+
+    public function __construct(?mysqli $connection = null)
    {
-        $this->db = mysqli_connect("localhost", "root", "", "screaming_frog");
+        $connection ??= mysqli_connect('localhost', 'root', '', 'screaming_frog');
+
+        if (!$connection instanceof mysqli) {
+            throw new RuntimeException('Verbindung zur Datenbank konnte nicht hergestellt werden: ' . mysqli_connect_error());
        }

+        $connection->set_charset('utf8mb4');
+        $this->db = $connection;
+    }

    /**
-     * Holt eine einzelne URL via cURL und liefert Response-Metadaten.
+     * Holt eine einzelne URL und gibt Response-Metadaten zurueck.
     *
     * @param string $url Zieladresse fuer den Abruf.
     * @return array<string,mixed> Antwortdaten oder ein "error"-Schluessel.
     */
-    function getWebsite($url)
+    public function getWebsite(string $url): array
    {
-        // cURL-Session initialisieren
-        $ch = curl_init();
+        $handle = $this->createCurlHandle($url);
+        $response = curl_exec($handle);

-        // cURL-Optionen setzen
-        curl_setopt($ch, CURLOPT_URL, $url);
-        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);  // Antwort als String zurückgeben
-        curl_setopt($ch, CURLOPT_HEADER, true);          // Header in der Antwort einschließen
-        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);  // Weiterleitungen folgen
-        curl_setopt($ch, CURLOPT_TIMEOUT, 30);           // Timeout nach 30 Sekunden
-        curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'); // User Agent setzen
-        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); // SSL-Zertifikat nicht prüfen (nur für Tests)
-
-        // Anfrage ausführen
-        $response = curl_exec($ch);
-
-        // Fehler überprüfen
-        if (curl_errno($ch)) {
-            $error = curl_error($ch);
-            curl_close($ch);
+        if ($response === false) {
+            $error = curl_error($handle);
+            curl_close($handle);
            return ['error' => $error];
        }

-        // Informationen abrufen
-        $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
-        $headerSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
-        $totalTime = curl_getinfo($ch, CURLINFO_TOTAL_TIME);
-        $effectiveUrl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
+        $info = curl_getinfo($handle);
+        curl_close($handle);

-        // cURL-Session schließen
-        curl_close($ch);
-
-        // Header und Body trennen
-        $headers = substr($response, 0, $headerSize);
-        $body = substr($response, $headerSize);
-
-        // Header in Array umwandeln
-        $headerLines = explode("\r\n", trim($headers));
-        $parsedHeaders = [];
-
-        foreach ($headerLines as $line) {
-            if (strpos($line, ':') !== false) {
-                list($key, $value) = explode(':', $line, 2);
-                $parsedHeaders[trim($key)] = trim($value);
-            }
-        }
-
-        return [
-            'url' => $effectiveUrl,
-            'status_code' => $httpCode,
-            // 'headers_raw' => $headers,
-            'headers_parsed' => $parsedHeaders,
-            'body' => $body,
-            'response_time' => $totalTime,
-            'body_size' => strlen($body)
-        ];
+        return $this->buildResponsePayload($response, $info);
    }

    /**
@@ -91,232 +56,202 @@ class webanalyse
     * @param array<int,string> $urls Liste von Ziel-URLs.
     * @return array<string,array<string,mixed>> Antworten je URL.
     */
-    function getMultipleWebsites($urls)
+    public function getMultipleWebsites(array $urls): array
    {
+        if ($urls === []) {
+            return [];
+        }

        $results = [];
-        $curlHandles = [];
        $multiHandle = curl_multi_init();
+        $handles = [];

-        // Einzelne cURL-Handles für jede URL erstellen
        foreach ($urls as $url) {
-            $ch = curl_init();
-
-            // cURL-Optionen setzen (gleich wie bei getWebsite)
-            curl_setopt($ch, CURLOPT_URL, $url);
-            curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
-            curl_setopt($ch, CURLOPT_HEADER, true);
-            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
-            curl_setopt($ch, CURLOPT_TIMEOUT, 30);
-            curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36');
-            curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
-
-            // Handle zum Multi-Handle hinzufügen
-            curl_multi_add_handle($multiHandle, $ch);
-            $curlHandles[$url] = $ch;
+            $handle = $this->createCurlHandle($url);
+            $handles[$url] = $handle;
+            curl_multi_add_handle($multiHandle, $handle);
        }

-        // Alle Anfragen parallel ausführen
        $running = null;
        do {
-            curl_multi_exec($multiHandle, $running);
-            curl_multi_select($multiHandle);
-        } while ($running > 0);
+            $status = curl_multi_exec($multiHandle, $running);
+        } while ($status === CURLM_CALL_MULTI_PERFORM);

+        while ($running && $status === CURLM_OK) {
+            if (curl_multi_select($multiHandle, 1.0) === -1) {
+                usleep(100000);
+            }

-        // Ergebnisse verarbeiten
-        foreach ($urls as $url) {
-            $ch = $curlHandles[$url];
-            $response = curl_multi_getcontent($ch);
+            do {
+                $status = curl_multi_exec($multiHandle, $running);
+            } while ($status === CURLM_CALL_MULTI_PERFORM);
+        }

-            // Fehler überprüfen
-            if (curl_errno($ch)) {
-                $error = curl_error($ch);
-                $results[$url] = ['error' => $error];
+        foreach ($handles as $url => $handle) {
+            $response = curl_multi_getcontent($handle);
+
+            if ($response === false) {
+                $results[$url] = ['error' => curl_error($handle)];
            } else {
-                // Informationen abrufen
-                $httpCode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
-                $headerSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
-                $totalTime = curl_getinfo($ch, CURLINFO_TOTAL_TIME);
-                $effectiveUrl = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
-
-                // Header und Body trennen
-                $headers = substr($response, 0, $headerSize);
-                $body = substr($response, $headerSize);
-
-                // Header in Array umwandeln
-                $headerLines = explode("\r\n", trim($headers));
-                $parsedHeaders = [];
-
-                foreach ($headerLines as $line) {
-                    if (strpos($line, ':') !== false) {
-                        list($key, $value) = explode(':', $line, 2);
-                        $parsedHeaders[trim($key)] = trim($value);
-                    }
+                $results[$url] = $this->buildResponsePayload($response, curl_getinfo($handle));
            }

-                $results[$url] = [
-                    'url' => $effectiveUrl,
-                    'status_code' => $httpCode,
-                    'headers_parsed' => $parsedHeaders,
-                    'body' => $body,
-                    'response_time' => $totalTime,
-                    'body_size' => strlen($body)
-                ];
+            curl_multi_remove_handle($multiHandle, $handle);
+            curl_close($handle);
        }

-            // Handle aus Multi-Handle entfernen und schließen
-            curl_multi_remove_handle($multiHandle, $ch);
-            curl_close($ch);
-        }
-
-        // Multi-Handle schließen
        curl_multi_close($multiHandle);

        return $results;
    }

-
-
-
    /**
     * Persistiert Response-Daten und stoesst die Analyse der gefundenen Links an.
     *
     * @param int $crawlID Identifier der Crawl-Session.
     * @param string $url Ursprung-URL, deren Antwort verarbeitet wird.
     * @param array<string,mixed> $data Ergebnis der HTTP-Abfrage.
-     * @return void
     */
-    function processResults(int $crawlID, string $url, array $data)
+    public function processResults(int $crawlID, string $url, array $data): void
    {
-        if (!isset($data['error'])) {
-            $status_code = $data['status_code'];
-            $response_time = $data['response_time'];
-            $body_size = $data['body_size'];
-            $date = date('Y-m-d H:i:s');
-            $body = $data['body'];
-
-            $sql = "UPDATE urls SET 
-            status_code = " . $status_code . ", 
-            response_time = " . ($response_time * 1000) . ", 
-            body_size = " . $body_size . ", 
-            date = now(),
-            body = '" . $this->db->real_escape_string($body) . "'
-
-            WHERE url = '" . $this->db->real_escape_string($url) . "' AND crawl_id = " . $crawlID . " LIMIT 1";
-            // echo $sql;
-
-            $this->db->query($sql);
-        } else {
-            // Handle error case if needed
-            echo "Fehler bei der Analyse von $url: " . $data['error'] . "\n";
+        if (isset($data['error'])) {
+            error_log(sprintf('Fehler bei der Analyse von %s: %s', $url, $data['error']));
+            return;
        }

+        $body = (string)($data['body'] ?? '');
+
+        $update = $this->db->prepare(
+            'UPDATE urls
+             SET status_code = ?, response_time = ?, body_size = ?, date = NOW(), body = ?
+             WHERE url = ? AND crawl_id = ?
+             LIMIT 1'
+        );
+
+        if ($update === false) {
+            throw new RuntimeException('Update-Statement konnte nicht vorbereitet werden: ' . $this->db->error);
+        }
+
+        $statusCode = (int)($data['status_code'] ?? 0);
+        $responseTimeMs = (int)round(((float)($data['response_time'] ?? 0)) * 1000);
+        $bodySize = (int)($data['body_size'] ?? strlen($body));
+
+        $update->bind_param('iiissi', $statusCode, $responseTimeMs, $bodySize, $body, $url, $crawlID);
+        $update->execute();
+        $update->close();
+
        $this->findNewUrls($crawlID, $body, $url);
    }

-
    /**
     * Extrahiert Links aus einer Antwort und legt neue URL-Datensaetze an.
     *
     * @param int $crawlID Identifier der Crawl-Session.
     * @param string $body HTML-Koerper der Antwort.
     * @param string $url Bearbeitete URL, dient als Kontext fuer relative Links.
-     * @return void
     */
-    function findNewUrls(int $crawlID, string $body, string $url) {
-
-
-
+    public function findNewUrls(int $crawlID, string $body, string $url): void
+    {
+        if ($body === '') {
+            return;
+        }

        $links = $this->extractLinks($body, $url);
-
-        $temp = $this->db->query("select id from urls where url = '".$this->db->real_escape_string($url)."' and crawl_id = ".$crawlID." LIMIT 1")->fetch_all(MYSQLI_ASSOC);
-        $vonUrlId = $temp[0]['id'];
-
-
-        $this->db->query("delete from links where von = ".$vonUrlId);
-
-        foreach($links as $l) {
-
-            $u = $this->db->query("insert ignore into urls (url, crawl_id) values ('".$this->db->real_escape_string($l['absolute_url'])."',".$crawlID.")");
-            $id = $this->db->insert_id;
-            if ($id === 0) {
-                $qwer = $this->db->query("select id from urls where url = '".$this->db->real_escape_string($l['absolute_url'])."' and crawl_id = ".$crawlID." LIMIT 1")->fetch_all(MYSQLI_ASSOC);
-                $id = $qwer[0]['id'];
+        if ($links === []) {
+            return;
        }

-
-
-
-
-            $sql_links = "insert ignore into links (von, nach, linktext, dofollow) values (
-            ".$vonUrlId.",
-            ".$id.",
-
-
-            '".$this->db->real_escape_string(mb_convert_encoding($l['text'],"UTF-8"))."',
-            ".(strstr($l['rel']??"", 'nofollow') === false ? 1 : 0)."
-
-
-            )";
-
-            echo $sql_links;
-
-            $u = $this->db->query($sql_links);
-        
-            
-
+        $originId = $this->resolveUrlId($crawlID, $url);
+        if ($originId === null) {
+            return;
        }

-
-
-        print_r($links);
-
-
+        $deleteLinksStmt = $this->db->prepare('DELETE FROM links WHERE von = ?');
+        if ($deleteLinksStmt !== false) {
+            $deleteLinksStmt->bind_param('i', $originId);
+            $deleteLinksStmt->execute();
+            $deleteLinksStmt->close();
        }

+        $insertUrlStmt = $this->db->prepare('INSERT IGNORE INTO urls (url, crawl_id) VALUES (?, ?)');
+        $selectUrlStmt = $this->db->prepare('SELECT id FROM urls WHERE url = ? AND crawl_id = ? LIMIT 1');
+        $insertLinkStmt = $this->db->prepare('INSERT IGNORE INTO links (von, nach, linktext, dofollow) VALUES (?, ?, ?, ?)');
+
+        if (!$insertUrlStmt || !$selectUrlStmt || !$insertLinkStmt) {
+            throw new RuntimeException('Vorbereitete Statements konnten nicht erstellt werden: ' . $this->db->error);
+        }
+
+        foreach ($links as $link) {
+            $absoluteUrl = (string)$link['absolute_url'];
+
+            $insertUrlStmt->bind_param('si', $absoluteUrl, $crawlID);
+            $insertUrlStmt->execute();
+
+            $targetId = $this->db->insert_id;
+            if ($targetId === 0) {
+                $selectUrlStmt->bind_param('si', $absoluteUrl, $crawlID);
+                $selectUrlStmt->execute();
+                $result = $selectUrlStmt->get_result();
+                $targetId = $result ? (int)($result->fetch_assoc()['id'] ?? 0) : 0;
+            }
+
+            if ($targetId === 0) {
+                continue;
+            }
+
+            $linkText = $this->normaliseText((string)($link['text'] ?? ''));
+            $isFollow = (int)(strpos((string)($link['rel'] ?? ''), 'nofollow') !== false ? 0 : 1);
+
+            $insertLinkStmt->bind_param('iisi', $originId, $targetId, $linkText, $isFollow);
+            $insertLinkStmt->execute();
+        }
+
+        $insertUrlStmt->close();
+        $selectUrlStmt->close();
+        $insertLinkStmt->close();
+    }

    /**
     * Startet einen Crawl-Durchlauf fuer unbehandelte URLs.
     *
     * @param int $crawlID Identifier der Crawl-Session.
-     * @return void
     */
-    function doCrawl(int $crawlID)
+    public function doCrawl(int $crawlID): void
    {
+        $statement = $this->db->prepare(
+            'SELECT url FROM urls WHERE crawl_id = ? AND date IS NULL LIMIT 50'
+        );

-        $urls2toCrawl = $this->db->query("select * from urls where crawl_id = " . $crawlID . " and date is null LIMIT 2")->fetch_all(MYSQLI_ASSOC); // and date is not null
-
-
-        $urls = [];
-        foreach ($urls2toCrawl as $u) {
-            $urls[] = $u['url'];
+        if ($statement === false) {
+            return;
        }

-        $multipleResults = $this->getMultipleWebsites($urls);
+        $statement->bind_param('i', $crawlID);
+        $statement->execute();
+        $result = $statement->get_result();

-        // print_r($multipleResults);
-        foreach ($multipleResults as $url => $data) {
+        if (!$result instanceof mysqli_result) {
+            $statement->close();
+            return;
+        }

+        $urls = [];
+        while ($row = $result->fetch_assoc()) {
+            $urls[] = $row['url'];
+        }
+
+        $result->free();
+        $statement->close();
+
+        if ($urls === []) {
+            return;
+        }
+
+        foreach ($this->getMultipleWebsites($urls) as $url => $data) {
            $this->processResults($crawlID, $url, $data);
        }
    }

-
-
-
-
-
-
-
-
-
-
-
-
-
-
    /**
     * Parst HTML-Inhalt und liefert eine strukturierte Liste gefundener Links.
     *
@@ -324,50 +259,41 @@ class webanalyse
     * @param string $baseUrl Basis-URL fuer die Aufloesung relativer Pfade.
     * @return array<int,array<string,mixed>> Gesammelte Linkdaten.
     */
-    function extractLinks($html, $baseUrl = '')
+    public function extractLinks(string $html, string $baseUrl = ''): array
    {
        $links = [];

-        // DOMDocument erstellen und HTML laden
        $dom = new DOMDocument();
-
-        // Fehlerbehandlung für ungültiges HTML
-        libxml_use_internal_errors(true);
+        $previous = libxml_use_internal_errors(true);
        $dom->loadHTML('<?xml encoding="UTF-8">' . $html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
        libxml_clear_errors();
+        libxml_use_internal_errors($previous);

-        // Alle <a> Tags finden
-        $aTags = $dom->getElementsByTagName('a');
+        foreach ($dom->getElementsByTagName('a') as $index => $aTag) {
+            $href = trim($aTag->getAttribute('href'));
+            if ($href === '') {
+                continue;
+            }

-        foreach ($aTags as $index => $aTag) {
-            $href = $aTag->getAttribute('href');
-            $text = trim($aTag->textContent);
+            $absoluteUrl = $this->resolveUrl($href, $baseUrl);
+            $text = $this->normaliseText(trim($aTag->textContent));
            $rel = $aTag->getAttribute('rel');
            $title = $aTag->getAttribute('title');
            $target = $aTag->getAttribute('target');

-            // Nur Links mit href-Attribut
-            if (!empty($href)) {
-                // Relative URLs zu absoluten URLs konvertieren
-                $absoluteUrl = $href;
-                if (!empty($baseUrl) && !preg_match('/^https?:\/\//', $href)) {
-                    $absoluteUrl = rtrim($baseUrl, '/') . '/' . ltrim($href, '/');
-                }
-
            $links[] = [
                'index' => $index + 1,
                'href' => $href,
                'absolute_url' => $absoluteUrl,
                'text' => $text,
-                    'rel' => $rel ?: null,
-                    'title' => $title ?: null,
-                    'target' => $target ?: null,
-                    'is_external' => $this->isExternalLink($href, $baseUrl),
+                'rel' => $rel !== '' ? $rel : null,
+                'title' => $title !== '' ? $title : null,
+                'target' => $target !== '' ? $target : null,
+                'is_external' => $this->isExternalLink($absoluteUrl, $baseUrl),
                'link_type' => $this->getLinkType($href),
-                    'is_internal' => $this->isInternalLink($href, $baseUrl)?1:0
+                'is_internal' => $this->isInternalLink($absoluteUrl, $baseUrl) ? 1 : 0,
            ];
        }
-        }

        return $links;
    }
@@ -379,19 +305,20 @@ class webanalyse
     * @param string $baseUrl Ausgangsadresse zur Domainabgleichung.
     * @return bool|null True fuer extern, false fuer intern, null falls undefiniert.
     */
-    private function isExternalLink($href, $baseUrl)
+    private function isExternalLink(string $href, string $baseUrl): ?bool
    {
-        if (empty($baseUrl)) return null;
-
-        // Relative Links sind intern
-        if (!preg_match('/^https?:\/\//', $href)) {
-            return false;
+        if ($baseUrl === '') {
+            return null;
        }

        $baseDomain = parse_url($baseUrl, PHP_URL_HOST);
        $linkDomain = parse_url($href, PHP_URL_HOST);

-        return $baseDomain !== $linkDomain;
+        if ($baseDomain === null || $linkDomain === null) {
+            return null;
+        }
+
+        return !hash_equals($baseDomain, $linkDomain);
    }

    /**
@@ -401,19 +328,20 @@ class webanalyse
     * @param string $baseUrl Ausgangsadresse zur Domainabgleichung.
     * @return bool|null True fuer intern, false fuer extern, null falls undefiniert.
     */
-    private function isInternalLink($href, $baseUrl)
+    private function isInternalLink(string $href, string $baseUrl): ?bool
    {
-        if (empty($baseUrl)) return null;
-
-        // Relative Links sind intern
-        if (!preg_match('/^https?:\/\//', $href)) {
-            return true;
+        if ($baseUrl === '') {
+            return null;
        }

        $baseDomain = parse_url($baseUrl, PHP_URL_HOST);
        $linkDomain = parse_url($href, PHP_URL_HOST);

-        return $baseDomain === $linkDomain;
+        if ($baseDomain === null || $linkDomain === null) {
+            return null;
+        }
+
+        return hash_equals($baseDomain, $linkDomain);
    }

    /**
@@ -422,17 +350,31 @@ class webanalyse
     * @param string $href Ziel des Links.
     * @return string Beschreibender Typ wie "absolute" oder "email".
     */
-    private function getLinkType($href)
+    private function getLinkType(string $href): string
    {
-        if (empty($href)) return 'empty';
-        if (strpos($href, 'mailto:') === 0) return 'email';
-        if (strpos($href, 'tel:') === 0) return 'phone';
-        if (strpos($href, '#') === 0) return 'anchor';
-        if (strpos($href, 'javascript:') === 0) return 'javascript';
-        if (preg_match('/^https?:\/\//', $href)) return 'absolute';
-        return 'relative';
+        if ($href === '') {
+            return 'empty';
        }

+        $lower = strtolower($href);
+        if (strpos($lower, 'mailto:') === 0) {
+            return 'email';
+        }
+        if (strpos($lower, 'tel:') === 0) {
+            return 'phone';
+        }
+        if (strpos($lower, '#') === 0) {
+            return 'anchor';
+        }
+        if (strpos($lower, 'javascript:') === 0) {
+            return 'javascript';
+        }
+        if (filter_var($href, FILTER_VALIDATE_URL)) {
+            return 'absolute';
+        }
+
+        return 'relative';
+    }

    /**
     * Gruppiert Links anhand ihres vorab bestimmten Typs.
@@ -440,18 +382,149 @@ class webanalyse
     * @param array<int,array<string,mixed>> $links Liste der extrahierten Links.
     * @return array<string,array<int,array<string,mixed>>> Links nach Typ gruppiert.
     */
-    function groupLinksByType($links)
+    public function groupLinksByType(array $links): array
    {
        $grouped = [];

        foreach ($links as $link) {
-            $type = $link['link_type'];
-            if (!isset($grouped[$type])) {
-                $grouped[$type] = [];
-            }
+            $type = (string)($link['link_type'] ?? 'unknown');
            $grouped[$type][] = $link;
        }

        return $grouped;
    }
+
+    /**
+     * Creates a cURL easy handle pre-configured for fetching a single page.
+     *
+     * The handle returns the response as a string with the raw headers prepended and follows
+     * redirects. Because the URLs passed in include links harvested from crawled pages, the
+     * transfer is limited to HTTP(S) (also across redirects) and to a bounded number of hops.
+     *
+     * @param string $url Address to request.
+     * @return CurlHandle Configured handle; the caller is responsible for closing it.
+     * @throws RuntimeException If the handle cannot be created or configured.
+     */
+    private function createCurlHandle(string $url)
+    {
+        // curl_init($url) already sets CURLOPT_URL, so it is not repeated below.
+        $handle = curl_init($url);
+        if ($handle === false) {
+            throw new RuntimeException('Konnte Curl-Handle nicht initialisieren: ' . $url);
+        }
+
+        $configured = curl_setopt_array($handle, [
+            CURLOPT_RETURNTRANSFER => true,
+            CURLOPT_HEADER => true,
+            CURLOPT_FOLLOWLOCATION => true,
+            // Stop redirect loops early instead of waiting for the overall timeout.
+            CURLOPT_MAXREDIRS => 10,
+            // Never let a crawled link or a redirect reach file://, gopher://, ftp:// and the like.
+            CURLOPT_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
+            CURLOPT_REDIR_PROTOCOLS => CURLPROTO_HTTP | CURLPROTO_HTTPS,
+            CURLOPT_TIMEOUT => self::CURL_TIMEOUT,
+            CURLOPT_USERAGENT => self::USER_AGENT,
+            // NOTE(review): certificate verification is switched off, which accepts any TLS peer.
+            // Kept as-is so hosts with broken certificates stay crawlable; confirm this is intended.
+            CURLOPT_SSL_VERIFYPEER => false,
+        ]);
+
+        if ($configured === false) {
+            curl_close($handle);
+            throw new RuntimeException('Konnte Curl-Handle nicht konfigurieren: ' . $url);
+        }
+
+        return $handle;
+    }
+
+    /**
+     * Separates a raw cURL response into header and body and assembles the result array.
+     *
+     * @param string $response Complete response text with the header section first.
+     * @param array<string,mixed> $info Result of curl_getinfo() for the same transfer.
+     * @return array<string,mixed> Keys: url, status_code, headers_parsed, body, response_time, body_size.
+     */
+    private function buildResponsePayload(string $response, array $info): array
+    {
+        // header_size marks where the header section ends inside the combined response.
+        $offset = (int)($info['header_size'] ?? 0);
+        $rawHeaders = substr($response, 0, $offset);
+        $content = substr($response, $offset);
+
+        $payload = [];
+        $payload['url'] = $info['url'] ?? ($info['redirect_url'] ?? '');
+        $payload['status_code'] = (int)($info['http_code'] ?? 0);
+        $payload['headers_parsed'] = $this->parseHeaders($rawHeaders);
+        $payload['body'] = $content;
+        $payload['response_time'] = (float)($info['total_time'] ?? 0.0);
+        $payload['body_size'] = strlen($content);
+
+        return $payload;
+    }
+
+    /**
+     * Converts the raw header section of a response into an associative array.
+     *
+     * Redirects are followed with headers included, so the section can contain one header block
+     * per hop. Only the last block (the final response) is parsed; otherwise headers of
+     * intermediate responses, such as "Location", would leak into the result. Lines without a
+     * colon (the status line) are skipped. When a header name repeats, the last occurrence wins.
+     *
+     * @param string $headers Raw header text as delivered by cURL.
+     * @return array<string,string> Header values keyed by the header name as sent.
+     */
+    private function parseHeaders(string $headers): array
+    {
+        // Header blocks are separated by an empty line; the final response is the last one.
+        $blocks = preg_split('/\r?\n\r?\n/', trim($headers)) ?: [];
+        $finalBlock = $blocks === [] ? '' : $blocks[count($blocks) - 1];
+
+        $parsed = [];
+        foreach (preg_split('/\r?\n/', $finalBlock) ?: [] as $line) {
+            if (strpos($line, ':') === false) {
+                continue;
+            }
+
+            [$key, $value] = explode(':', $line, 2);
+            $parsed[trim($key)] = trim($value);
+        }
+
+        return $parsed;
+    }
+
+    /**
+     * Resolves a link target against a base URL and returns an absolute address.
+     *
+     * Handled reference forms:
+     *  - targets with their own scheme (http:, mailto:, javascript:, ...) are returned unchanged,
+     *  - protocol-relative targets ("//host/path") inherit the base scheme,
+     *  - root-relative targets ("/path") replace the base path,
+     *  - query-only and fragment-only targets ("?a=1", "#top") keep the base document,
+     *  - all other targets are appended to the directory of the base path.
+     * "." and ".." path segments are removed from the result. User info of the base URL is not
+     * carried over.
+     *
+     * @param string $href Link target as found in the document.
+     * @param string $baseUrl URL of the document containing the link.
+     * @return string Absolute URL, or $href unchanged when it cannot or need not be resolved.
+     */
+    private function resolveUrl(string $href, string $baseUrl): string
+    {
+        if ($href === '' || $baseUrl === '') {
+            return $href;
+        }
+
+        // A leading "scheme:" means the reference is already absolute (or not a web address at all).
+        if (preg_match('#^[a-z][a-z0-9+.-]*:#i', $href) === 1) {
+            return $href;
+        }
+
+        $baseParts = parse_url($baseUrl);
+        if ($baseParts === false || !isset($baseParts['scheme'], $baseParts['host'])) {
+            return $href;
+        }
+
+        $scheme = $baseParts['scheme'];
+
+        // Protocol-relative reference: it names its own host and only borrows the scheme.
+        if (strpos($href, '//') === 0) {
+            return $scheme . ':' . $href;
+        }
+
+        $authority = $baseParts['host'] . (isset($baseParts['port']) ? ':' . $baseParts['port'] : '');
+        $basePath = $baseParts['path'] ?? '/';
+
+        // Split the reference into its path and the trailing query/fragment part.
+        $pathLength = strcspn($href, '?#');
+        $hrefPath = (string)substr($href, 0, $pathLength);
+        $suffix = (string)substr($href, $pathLength);
+
+        if ($hrefPath === '') {
+            // Query-only or fragment-only reference: same document as the base.
+            $path = $basePath;
+            if ($suffix[0] === '#' && isset($baseParts['query'])) {
+                $suffix = '?' . $baseParts['query'] . $suffix;
+            }
+        } elseif ($hrefPath[0] === '/') {
+            $path = $hrefPath;
+        } else {
+            // Drop the last segment of the base path to get its directory.
+            $path = (preg_replace('#[^/]*$#', '', $basePath) ?: '/') . $hrefPath;
+        }
+
+        // Remove "." and ".." segments; ".." never climbs above the root.
+        $segments = explode('/', ltrim($path, '/'));
+        $lastIndex = count($segments) - 1;
+        $resolved = [];
+        foreach ($segments as $position => $segment) {
+            if ($segment === '.' || $segment === '..') {
+                if ($segment === '..') {
+                    array_pop($resolved);
+                }
+                if ($position === $lastIndex) {
+                    // A trailing dot segment refers to a directory, so keep the trailing slash.
+                    $resolved[] = '';
+                }
+                continue;
+            }
+            $resolved[] = $segment;
+        }
+
+        return sprintf('%s://%s/%s%s', $scheme, $authority, implode('/', $resolved), $suffix);
+    }
+
+    /**
+     * Returns the text as clean UTF-8 with whitespace collapsed and control characters removed.
+     *
+     * Input that is not valid UTF-8 is converted first (Windows-1252, falling back to
+     * ISO-8859-1). The conversion has to precede the regular expression: with the "u" modifier
+     * preg_replace() fails on invalid UTF-8, which would otherwise discard the whole text.
+     * Every run of whitespace and/or control characters becomes a single space, and the result
+     * is trimmed.
+     *
+     * @param string $text Text in UTF-8 or a single-byte Western encoding.
+     * @return string Normalised UTF-8 text.
+     */
+    private function normaliseText(string $text): string
+    {
+        if ($text === '') {
+            return '';
+        }
+
+        if (!mb_check_encoding($text, 'UTF-8')) {
+            // Windows-1252 is tried first; ISO-8859-1 accepts any byte and serves as the fallback.
+            $encoding = mb_detect_encoding($text, ['Windows-1252', 'ISO-8859-1'], true) ?: 'ISO-8859-1';
+            $text = (string)mb_convert_encoding($text, 'UTF-8', $encoding);
+        }
+
+        $collapsed = preg_replace('/[\s\p{Cc}]+/u', ' ', $text);
+
+        // Fall back to the converted text if the pattern engine still reports an error.
+        return trim($collapsed ?? $text);
+    }
+
+    /**
+     * Looks up the row id of a URL within one crawl.
+     *
+     * @param int $crawlID Crawl session to search in.
+     * @param string $url URL whose row id is wanted.
+     * @return int|null Row id, or null when the statement cannot be prepared, no result set is
+     *                  available, or no row matches.
+     */
+    private function resolveUrlId(int $crawlID, string $url): ?int
+    {
+        $lookup = $this->db->prepare('SELECT id FROM urls WHERE url = ? AND crawl_id = ? LIMIT 1');
+        if ($lookup === false) {
+            return null;
+        }
+
+        $lookup->bind_param('si', $url, $crawlID);
+        $lookup->execute();
+
+        $found = null;
+        $rows = $lookup->get_result();
+        if ($rows) {
+            // fetch_assoc() gives no array when nothing matched; the id then stays null.
+            $found = $rows->fetch_assoc()['id'] ?? null;
+        }
+        $lookup->close();
+
+        if ($found === null) {
+            return null;
+        }
+
+        return (int)$found;
+    }
 }