Start

2025-09-26 21:24:25 +02:00
commit b3e8f2ce85
4 changed files with 434 additions and 0 deletions
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -0,0 +1,19 @@
+# Repository Guidelines
+
+## Project Structure & Module Organization
+The codebase is intentionally lean. `index.php` bootstraps the crawl by instantiating `webanalyse` and handing off the crawl identifier. Core crawling logic lives in `webanalyse.php`, which houses HTTP fetching, link extraction, and database persistence. Use `setnew.php` to reset seed data inside the `screaming_frog` schema before a rerun. Keep new helpers in their own PHP files under this root so the `require_once` includes stay predictable; group SQL migrations or fixtures under a `database/` folder if you add them. IDE settings reside in `.idea/`.
+
+## Build, Test, and Development Commands
+Run the project through Apache in XAMPP or start the PHP built-in server with `php -S localhost:8080 index.php` from this directory. Validate syntax quickly via `php -l webanalyse.php` (repeat for any new file). When iterating on crawl logic, truncate runtime tables with `php setnew.php` to restore the baseline dataset.
+
+## Coding Style & Naming Conventions
+Follow PSR-12 style cues already in use: 4-space indentation, brace-on-new-line for functions, and `declare(strict_types=1);` at the top of entry scripts. Favour descriptive camelCase for methods (`getMultipleWebsites`) and snake_case only for direct SQL field names. Maintain `mysqli` usage for consistency, and gate new configuration through constants or clearly named environment variables.
+
+## Testing Guidelines
+There is no automated suite yet; treat each crawl as an integration test. After code changes, run `php setnew.php` followed by a crawl and confirm that the `crawl`, `urls`, and `links` tables reflect the expected row counts. Log anomalies with `error_log()` while developing, and remove those calls or replace them with structured responses before merging.
+
+## Commit & Pull Request Guidelines
+Author commit messages in the present tense with a concise summary (`Add link grouping for external URLs`). Group related SQL adjustments with their PHP changes in the same commit. For pull requests, include: a short context paragraph, reproduction steps, screenshots of key output tables when behaviour changes, and any follow-up tasks. Link tracking tickets or issues so downstream agents can trace decisions.
+
+## Security & Configuration Notes
+Database credentials are currently hard-coded for local XAMPP usage. If you introduce environment-based configuration, document expected `.env` keys and ensure credentials are excluded from version control. Never commit production connection details or raw crawl exports.
--- a/index.php
+++ b/index.php
@@ -0,0 +1,13 @@
+<?php
+declare(strict_types=1);
+
+// Entry script: show every error during development, then run crawl 1.
+error_reporting(E_ALL);
+// String value: before PHP 8.1 ini_set() rejected ints under strict_types.
+ini_set('display_errors', '1');
+
+require_once 'webanalyse.php';
+
+// webanalyse opens its own connection, so no second mysqli handle is needed.
+$wa = new webanalyse();
+$wa->doCrawl(1);
--- a/setnew.php
+++ b/setnew.php
@@ -0,0 +1,11 @@
+<?php
+declare(strict_types=1);
+
+// Reset script: empties the crawl, urls and links tables and seeds one crawl
+// together with its start URL.
+$db = mysqli_connect("localhost", "root", "", "screaming_frog");
+if ($db === false) {
+    // Older PHP versions return false instead of throwing on a failed connect.
+    exit('Database connection failed: ' . mysqli_connect_error() . PHP_EOL);
+}
+
+$db->query("truncate table crawl");
+// $db->query("insert into crawl (start_url, user_id) values ('https://kies-media.de/', 1)");
+$db->query("insert into crawl (start_url, user_id) values ('https://kies-media.de/leistungen/externer-ausbilder-fuer-fachinformatiker/', 1)");
+
+$db->query("truncate table urls");
+// Copies the start URL of crawl 1 into urls as row 1
+// (assumes the freshly inserted crawl row received id 1 - TODO confirm).
+$db->query("insert ignore into urls (id, url, crawl_id) select 1,start_url, id from crawl where id = 1");
+
+$db->query("truncate table links");
--- a/webanalyse.php
+++ b/webanalyse.php
@@ -0,0 +1,391 @@
+<?php
+
+
+/**
+ * Minimal crawler: downloads pages with cURL, stores each response in the
+ * `urls` table and records the <a> links of every page in the `links` table.
+ */
+class webanalyse
+{
+    /** @var mysqli|false Connection handle as returned by mysqli_connect(). */
+    public $db;
+
+    /**
+     * Connects to the local `screaming_frog` schema.
+     */
+    public function __construct()
+    {
+        $this->db = mysqli_connect("localhost", "root", "", "screaming_frog");
+    }
+
+    /**
+     * Downloads a single URL, following redirects.
+     *
+     * @param string $url Address to request.
+     * @return array Either ['error' => message] or the keys url (effective URL),
+     *               status_code, headers_parsed, body, response_time (seconds)
+     *               and body_size (bytes).
+     */
+    public function getWebsite($url)
+    {
+        $ch = $this->createCurlHandle($url);
+        $response = curl_exec($ch);
+
+        if ($response === false || curl_errno($ch)) {
+            $error = curl_error($ch);
+            curl_close($ch);
+            return ['error' => $error];
+        }
+
+        $result = $this->buildResult($ch, $response);
+        curl_close($ch);
+
+        return $result;
+    }
+
+    /**
+     * Downloads several URLs in parallel through the cURL multi interface.
+     *
+     * @param iterable $urls Addresses to request; duplicates are fetched once.
+     * @return array Map of requested URL => result in the format of getWebsite().
+     */
+    public function getMultipleWebsites($urls)
+    {
+        $results = [];
+        $handles = [];
+        $multiHandle = curl_multi_init();
+
+        foreach ($urls as $url) {
+            // A second handle for the same URL would overwrite the first one in
+            // the map, leaking it and closing the other one twice.
+            if (isset($handles[$url])) {
+                continue;
+            }
+            $ch = $this->createCurlHandle($url);
+            curl_multi_add_handle($multiHandle, $ch);
+            $handles[$url] = $ch;
+        }
+
+        $running = 0;
+        do {
+            $status = curl_multi_exec($multiHandle, $running);
+            // curl_multi_select() returns -1 when it cannot wait on the
+            // descriptors; sleep briefly instead of spinning at full CPU.
+            if ($running > 0 && curl_multi_select($multiHandle) === -1) {
+                usleep(1000);
+            }
+        } while ($running > 0 && $status === CURLM_OK);
+
+        // Per-transfer result codes are only available through
+        // curl_multi_info_read(); curl_errno() alone reports 0 for multi handles.
+        $resultCodes = [];
+        while (($info = curl_multi_info_read($multiHandle)) !== false) {
+            foreach ($handles as $handleUrl => $handle) {
+                if ($handle === $info['handle']) {
+                    $resultCodes[$handleUrl] = $info['result'];
+                    break;
+                }
+            }
+        }
+
+        foreach ($handles as $url => $ch) {
+            if (!isset($resultCodes[$url])) {
+                // The multi loop stopped before this transfer finished.
+                $results[$url] = ['error' => (string) curl_multi_strerror($status)];
+            } elseif ($resultCodes[$url] !== CURLE_OK) {
+                $message = curl_error($ch);
+                if ($message === '') {
+                    $message = (string) curl_strerror($resultCodes[$url]);
+                }
+                $results[$url] = ['error' => $message];
+            } else {
+                $results[$url] = $this->buildResult($ch, (string) curl_multi_getcontent($ch));
+            }
+
+            curl_multi_remove_handle($multiHandle, $ch);
+            curl_close($ch);
+        }
+
+        curl_multi_close($multiHandle);
+
+        return $results;
+    }
+
+    /**
+     * Creates a cURL handle with the request options shared by all fetches.
+     *
+     * @param string $url Address to request.
+     * @return resource|CurlHandle Configured handle that has not been executed yet.
+     */
+    private function createCurlHandle($url)
+    {
+        $ch = curl_init();
+        curl_setopt_array($ch, [
+            CURLOPT_URL => $url,
+            CURLOPT_RETURNTRANSFER => true,  // return the response as a string
+            CURLOPT_HEADER => true,          // keep the headers in front of the body
+            CURLOPT_FOLLOWLOCATION => true,  // follow redirects
+            CURLOPT_TIMEOUT => 30,           // give up after 30 seconds
+            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
+            // NOTE(review): certificate verification is switched off (test setups
+            // only); enable it before crawling over untrusted networks.
+            CURLOPT_SSL_VERIFYPEER => false,
+        ]);
+
+        return $ch;
+    }
+
+    /**
+     * Turns a finished transfer into the result array returned to callers.
+     *
+     * @param resource|CurlHandle $ch Handle whose transfer completed successfully.
+     * @param string $response Raw response: all header blocks followed by the body.
+     * @return array Keys url, status_code, headers_parsed, body, response_time, body_size.
+     */
+    private function buildResult($ch, string $response): array
+    {
+        $headerSize = (int) curl_getinfo($ch, CURLINFO_HEADER_SIZE);
+        $body = (string) substr($response, $headerSize);
+
+        return [
+            'url' => curl_getinfo($ch, CURLINFO_EFFECTIVE_URL),
+            'status_code' => curl_getinfo($ch, CURLINFO_HTTP_CODE),
+            'headers_parsed' => $this->parseFinalHeaders((string) substr($response, 0, $headerSize)),
+            'body' => $body,
+            'response_time' => curl_getinfo($ch, CURLINFO_TOTAL_TIME),
+            'body_size' => strlen($body),
+        ];
+    }
+
+    /**
+     * Parses the header block of the final response into a name => value map.
+     *
+     * With redirects enabled cURL hands back one header block per hop; only the
+     * last block belongs to the body that is returned.
+     *
+     * @param string $rawHeaders All header blocks as delivered by cURL.
+     * @return array Header name => value; the status line is skipped.
+     */
+    private function parseFinalHeaders(string $rawHeaders): array
+    {
+        $blocks = preg_split('/\r?\n\r?\n/', trim($rawHeaders));
+        $finalBlock = (string) end($blocks);
+
+        $parsed = [];
+        foreach (preg_split('/\r?\n/', $finalBlock) as $line) {
+            $separator = strpos($line, ':');
+            if ($separator === false) {
+                continue; // status line such as "HTTP/1.1 200 OK"
+            }
+            $parsed[trim(substr($line, 0, $separator))] = trim((string) substr($line, $separator + 1));
+        }
+
+        return $parsed;
+    }
+
+    /**
+     * Stores the outcome of one fetch and queues the links found on the page.
+     *
+     * @param int $crawlID Crawl the URL belongs to.
+     * @param string $url URL as it was requested (key of the `urls` row).
+     * @param array $data Result of getWebsite() / getMultipleWebsites().
+     */
+    public function processResults(int $crawlID, string $url, array $data)
+    {
+        $where = " WHERE url = '" . $this->db->real_escape_string($url) . "' AND crawl_id = " . $crawlID . " LIMIT 1";
+
+        if (isset($data['error'])) {
+            echo "Fehler bei der Analyse von $url: " . $data['error'] . "\n";
+            // There is no body to analyse. Stamp the attempt so doCrawl(), which
+            // selects rows with date IS NULL, does not pick this URL again forever.
+            $this->db->query("UPDATE urls SET date = now()" . $where);
+            return;
+        }
+
+        $body = (string) $data['body'];
+        // cURL reports seconds; the value is written in milliseconds. number_format()
+        // keeps the decimal point independent of the numeric locale.
+        $responseTimeMs = number_format((float) $data['response_time'] * 1000, 3, '.', '');
+
+        $sql = "UPDATE urls SET
+            status_code = " . (int) $data['status_code'] . ",
+            response_time = " . $responseTimeMs . ",
+            body_size = " . (int) $data['body_size'] . ",
+            date = now(),
+            body = '" . $this->db->real_escape_string($body) . "'" . $where;
+
+        $this->db->query($sql);
+
+        $this->findNewUrls($crawlID, $body, $url);
+    }
+
+    /**
+     * Registers every HTTP(S) link of a page in `urls` and `links`.
+     *
+     * The existing link rows of the page are deleted first, so the table always
+     * reflects the most recent fetch.
+     *
+     * @param int $crawlID Crawl the page belongs to.
+     * @param string $body HTML of the page.
+     * @param string $url URL of the page; relative links are resolved against it.
+     */
+    public function findNewUrls(int $crawlID, string $body, string $url)
+    {
+        $vonUrlId = $this->findUrlId($crawlID, $url);
+        if ($vonUrlId === null) {
+            // The page has no `urls` row in this crawl, so its links have no source id.
+            return;
+        }
+
+        $this->db->query("delete from links where von = " . $vonUrlId);
+
+        foreach ($this->extractLinks($body, $url) as $l) {
+            // The fragment addresses a position inside a page, not another document.
+            $target = explode('#', $l['absolute_url'], 2)[0];
+            if (!preg_match('/^https?:\/\//i', $target)) {
+                continue; // mailto:, tel:, javascript: ... cannot be crawled
+            }
+
+            $this->db->query("insert ignore into urls (url, crawl_id) values ('" . $this->db->real_escape_string($target) . "'," . $crawlID . ")");
+            $id = (int) $this->db->insert_id;
+            if ($id === 0) {
+                // Nothing was inserted (the row was ignored), so look the id up.
+                $id = $this->findUrlId($crawlID, $target);
+                if ($id === null) {
+                    continue;
+                }
+            }
+
+            $dofollow = stripos((string) $l['rel'], 'nofollow') === false ? 1 : 0;
+
+            $this->db->query("insert ignore into links (von, nach, linktext, dofollow) values ("
+                . $vonUrlId . ", "
+                . $id . ", '"
+                . $this->db->real_escape_string(mb_convert_encoding($l['text'], "UTF-8")) . "', "
+                . $dofollow . ")");
+        }
+    }
+
+    /**
+     * Looks up the id of a URL inside one crawl.
+     *
+     * @param int $crawlID Crawl to search in.
+     * @param string $url Exact URL to look for.
+     * @return int|null Row id, or null when the query fails or finds nothing.
+     */
+    private function findUrlId(int $crawlID, string $url)
+    {
+        $result = $this->db->query(
+            "select id from urls where url = '" . $this->db->real_escape_string($url)
+            . "' and crawl_id = " . $crawlID . " LIMIT 1"
+        );
+        if ($result === false) {
+            return null;
+        }
+
+        $row = $result->fetch_assoc();
+
+        return is_array($row) ? (int) $row['id'] : null;
+    }
+
+    /**
+     * Fetches and processes the next batch of URLs that have no `date` yet.
+     *
+     * @param int $crawlID Crawl to continue.
+     * @param int $limit Maximum number of URLs fetched in this call (at least 1).
+     */
+    public function doCrawl(int $crawlID, int $limit = 2)
+    {
+        $limit = max(1, $limit);
+        $result = $this->db->query("select url from urls where crawl_id = " . $crawlID . " and date is null LIMIT " . $limit);
+        if ($result === false) {
+            return;
+        }
+
+        $urls = array_column($result->fetch_all(MYSQLI_ASSOC), 'url');
+        if (count($urls) === 0) {
+            return; // queue is empty
+        }
+
+        foreach ($this->getMultipleWebsites($urls) as $url => $data) {
+            // Numeric-looking array keys come back as int; the callee expects a string.
+            $this->processResults($crawlID, (string) $url, $data);
+        }
+    }
+
+    /**
+     * Extracts all <a> elements that carry a non-empty href.
+     *
+     * @param string $html Markup to scan.
+     * @param string $baseUrl URL of the page; when given, relative hrefs are resolved against it.
+     * @return array List of links with the keys index, href, absolute_url, text,
+     *               rel, title, target, is_external, link_type and is_internal.
+     */
+    public function extractLinks($html, $baseUrl = '')
+    {
+        $links = [];
+        $dom = new DOMDocument();
+
+        // Collect parser warnings for malformed HTML internally and restore the
+        // previous setting afterwards; it is global libxml state.
+        $previous = libxml_use_internal_errors(true);
+        $dom->loadHTML('<?xml encoding="UTF-8">' . $html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
+        libxml_clear_errors();
+        libxml_use_internal_errors($previous);
+
+        foreach ($dom->getElementsByTagName('a') as $index => $aTag) {
+            $href = trim($aTag->getAttribute('href'));
+            // Compare against '' explicitly: empty() would also drop href="0".
+            if ($href === '') {
+                continue;
+            }
+
+            $rel = $aTag->getAttribute('rel');
+            $title = $aTag->getAttribute('title');
+            $target = $aTag->getAttribute('target');
+
+            $links[] = [
+                'index' => $index + 1,
+                'href' => $href,
+                'absolute_url' => $this->resolveUrl($href, (string) $baseUrl),
+                'text' => trim($aTag->textContent),
+                'rel' => $rel ?: null,
+                'title' => $title ?: null,
+                'target' => $target ?: null,
+                'is_external' => $this->isExternalLink($href, $baseUrl),
+                'link_type' => $this->getLinkType($href),
+                'is_internal' => $this->isInternalLink($href, $baseUrl) ? 1 : 0
+            ];
+        }
+
+        return $links;
+    }
+
+    /**
+     * Resolves an href against the URL of the page it was found on.
+     *
+     * Hrefs that already carry a scheme (https:, mailto:, tel:, ...) are returned
+     * unchanged, as is every href when no usable base URL is available.
+     *
+     * @param string $href Trimmed href attribute.
+     * @param string $baseUrl URL of the containing page, or '' when unknown.
+     * @return string Absolute URL where one can be built, otherwise $href.
+     */
+    private function resolveUrl(string $href, string $baseUrl): string
+    {
+        if ($baseUrl === '' || preg_match('/^[a-z][a-z0-9+.\-]*:/i', $href)) {
+            return $href;
+        }
+
+        $base = parse_url($baseUrl);
+        if ($base === false || !isset($base['scheme'], $base['host'])) {
+            return $href;
+        }
+        if ($href === '') {
+            return $baseUrl;
+        }
+
+        // Protocol-relative reference: only the scheme comes from the base.
+        if (strpos($href, '//') === 0) {
+            return $base['scheme'] . ':' . $href;
+        }
+
+        $authority = $base['scheme'] . '://' . $base['host'] . (isset($base['port']) ? ':' . $base['port'] : '');
+        $basePath = $base['path'] ?? '/';
+
+        if ($href[0] === '#') {
+            return $authority . $basePath . (isset($base['query']) ? '?' . $base['query'] : '') . $href;
+        }
+        if ($href[0] === '?') {
+            return $authority . $basePath . $href;
+        }
+
+        if ($href[0] === '/') {
+            $path = $href;
+        } else {
+            // Relative path: replace everything after the last slash of the base path.
+            $path = substr($basePath, 0, (int) strrpos($basePath, '/') + 1) . $href;
+        }
+
+        // Query and fragment must not take part in dot-segment removal.
+        $suffix = '';
+        $cut = strcspn($path, '?#');
+        if ($cut < strlen($path)) {
+            $suffix = substr($path, $cut);
+            $path = substr($path, 0, $cut);
+        }
+
+        return $authority . $this->removeDotSegments($path) . $suffix;
+    }
+
+    /**
+     * Collapses "." and ".." segments of an absolute path.
+     *
+     * @param string $path Path starting with "/".
+     * @return string Path without dot segments; ".." never climbs above the root.
+     */
+    private function removeDotSegments(string $path): string
+    {
+        $segments = explode('/', $path);
+        $last = count($segments) - 1;
+        $out = [];
+
+        foreach ($segments as $i => $segment) {
+            if ($segment === '.' || $segment === '..') {
+                // $out[0] is the empty segment in front of the leading slash.
+                if ($segment === '..' && count($out) > 1) {
+                    array_pop($out);
+                }
+                if ($i === $last) {
+                    $out[] = ''; // keep the trailing slash of a directory reference
+                }
+                continue;
+            }
+            $out[] = $segment;
+        }
+
+        return implode('/', $out);
+    }
+
+    /**
+     * Tells whether a link points to a different host than the page.
+     *
+     * @param string $href Href as found in the markup.
+     * @param string $baseUrl URL of the containing page.
+     * @return bool|null null without a base URL; false for hrefs that are not
+     *                   http(s) or protocol-relative; otherwise whether the hosts differ.
+     */
+    private function isExternalLink($href, $baseUrl)
+    {
+        if (empty($baseUrl)) {
+            return null;
+        }
+
+        // Protocol-relative links name a host, so they are compared like absolute ones.
+        if (strpos($href, '//') === 0) {
+            $href = 'https:' . $href;
+        }
+
+        // Everything else without http(s) scheme is treated as internal.
+        if (!preg_match('/^https?:\/\//i', $href)) {
+            return false;
+        }
+
+        // Host names are case-insensitive.
+        $baseDomain = strtolower((string) parse_url($baseUrl, PHP_URL_HOST));
+        $linkDomain = strtolower((string) parse_url($href, PHP_URL_HOST));
+
+        return $baseDomain !== $linkDomain;
+    }
+
+    /**
+     * Counterpart of isExternalLink().
+     *
+     * @param string $href Href as found in the markup.
+     * @param string $baseUrl URL of the containing page.
+     * @return bool|null null without a base URL, otherwise the negated external flag.
+     */
+    private function isInternalLink($href, $baseUrl)
+    {
+        $external = $this->isExternalLink($href, $baseUrl);
+
+        return $external === null ? null : !$external;
+    }
+
+    /**
+     * Classifies an href by its prefix.
+     *
+     * @param string $href Href as found in the markup.
+     * @return string One of empty, email, phone, anchor, javascript, absolute, relative.
+     */
+    private function getLinkType($href)
+    {
+        if ($href === null || $href === '') {
+            return 'empty';
+        }
+
+        $prefixes = [
+            'mailto:' => 'email',
+            'tel:' => 'phone',
+            '#' => 'anchor',
+            'javascript:' => 'javascript',
+        ];
+        foreach ($prefixes as $prefix => $type) {
+            // URI schemes are case-insensitive.
+            if (stripos($href, $prefix) === 0) {
+                return $type;
+            }
+        }
+
+        if (preg_match('/^https?:\/\//i', $href)) {
+            return 'absolute';
+        }
+
+        return 'relative';
+    }
+
+    /**
+     * Groups links by their link_type.
+     *
+     * @param array $links Links as returned by extractLinks().
+     * @return array Map of link_type => list of links, in input order.
+     */
+    public function groupLinksByType($links)
+    {
+        $grouped = [];
+
+        foreach ($links as $link) {
+            $grouped[$link['link_type']][] = $link;
+        }
+
+        return $grouped;
+    }
+}