From b3e8f2ce85ec8dfbebe5e0ce859299415e04d00b Mon Sep 17 00:00:00 2001 From: Martin Date: Fri, 26 Sep 2025 21:24:25 +0200 Subject: [PATCH] Start --- AGENTS.md | 19 +++ index.php | 13 ++ setnew.php | 11 ++ webanalyse.php | 391 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 434 insertions(+) create mode 100644 AGENTS.md create mode 100644 index.php create mode 100644 setnew.php create mode 100644 webanalyse.php diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..8b3c3af --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,19 @@ +# Repository Guidelines + +## Project Structure & Module Organization +The codebase is intentionally lean. `index.php` bootstraps the crawl by instantiating `webanalyse` and handing off the crawl identifier. Core crawling logic lives in `webanalyse.php`, which houses HTTP fetching, link extraction, and database persistence. Use `setnew.php` to reset seed data inside the `screaming_frog` schema before a rerun. Keep new helpers in their own PHP files under this root so the autoload includes stay predictable; group SQL migrations or fixtures under a `database/` folder if you add them. IDE settings reside in `.idea/`. + +## Build, Test, and Development Commands +Run the project through Apache in XAMPP or start the PHP built-in server with `php -S localhost:8080 index.php` from this directory. Validate syntax quickly via `php -l webanalyse.php` (repeat for any new file). When iterating on crawl logic, truncate runtime tables with `php setnew.php` to restore the baseline dataset. + +## Coding Style & Naming Conventions +Follow PSR-12 style cues already in use: 4-space indentation, brace-on-new-line for functions, and `declare(strict_types=1);` at the top of entry scripts. Favour descriptive camelCase for methods (`getMultipleWebsites`) and snake_case only for direct SQL field names. Maintain `mysqli` usage for consistency, and gate new configuration through constants or clearly named environment variables. 
+ +## Testing Guidelines +There is no automated suite yet; treat each crawl as an integration test. After code changes, run `php setnew.php` followed by a crawl and confirm that `crawl`, `urls`, and `links` tables reflect the expected row counts. Log anomalies with `error_log()` while developing, and remove or downgrade to structured responses before merging. + +## Commit & Pull Request Guidelines +Author commit messages in the present tense with a concise summary (`Add link grouping for external URLs`). Group related SQL adjustments with their PHP changes in the same commit. For pull requests, include: a short context paragraph, reproduction steps, screenshots of key output tables when behaviour changes, and any follow-up tasks. Link tracking tickets or issues so downstream agents can trace decisions. + +## Security & Configuration Notes +Database credentials are currently hard-coded for local XAMPP usage. If you introduce environment-based configuration, document expected `.env` keys and ensure credentials are excluded from version control. Never commit production connection details or raw crawl exports. 
diff --git a/index.php b/index.php
new file mode 100644
index 0000000..deb262d
--- /dev/null
+++ b/index.php
@@ -0,0 +1,13 @@
+ doCrawl(1);
+
diff --git a/setnew.php b/setnew.php
new file mode 100644
index 0000000..c5b508a
--- /dev/null
+++ b/setnew.php
@@ -0,0 +1,11 @@
+query("truncate table crawl");
+// $db->query("insert into crawl (start_url, user_id) values ('https://kies-media.de/', 1)");
+$db->query("insert into crawl (start_url, user_id) values ('https://kies-media.de/leistungen/externer-ausbilder-fuer-fachinformatiker/', 1)");
+
+$db->query("truncate table urls");
+$urls = $db->query("insert ignore into urls (id, url, crawl_id) select 1,start_url, id from crawl where id = 1"); #->fetch_all(MYSQLI_ASSOC)
+
+$db->query("truncate table links");
\ No newline at end of file
diff --git a/webanalyse.php b/webanalyse.php
new file mode 100644
index 0000000..dd63b00
--- /dev/null
+++ b/webanalyse.php
@@ -0,0 +1,391 @@
+db = mysqli_connect("localhost", "root", "", "screaming_frog");
+    }
+
+    /**
+     * Fetches a single URL and returns the parsed HTTP response.
+     *
+     * @param string $url Absolute URL to request.
+     * @return array ['error' => string] when the transfer failed; otherwise the keys
+     *               url (after redirects), status_code, headers_parsed, body,
+     *               response_time (seconds, as reported by cURL) and body_size (bytes).
+     */
+    function getWebsite($url)
+    {
+        $ch = $this->createCurlHandle($url);
+        $response = curl_exec($ch);
+
+        if ($response === false || curl_errno($ch)) {
+            $result = ['error' => curl_error($ch)];
+        } else {
+            $result = $this->buildResult($ch, $response);
+        }
+
+        curl_close($ch);
+
+        return $result;
+    }
+
+    /**
+     * Fetches several URLs in parallel through a cURL multi handle.
+     *
+     * @param array $urls List of absolute URLs; duplicates are requested only once.
+     * @return array Map of requested URL => result array, shaped like the return
+     *               value of getWebsite().
+     */
+    function getMultipleWebsites($urls)
+    {
+        $results = [];
+        $handles = [];
+        $transferCodes = [];
+        $multiHandle = curl_multi_init();
+
+        // Handles are keyed by URL, so a repeated URL would overwrite (and leak)
+        // the earlier handle. Request every distinct URL exactly once.
+        foreach (array_unique($urls) as $url) {
+            $ch = $this->createCurlHandle($url);
+            curl_multi_add_handle($multiHandle, $ch);
+            $handles[$url] = $ch;
+        }
+
+        // Drive all transfers until none is running any more.
+        $running = 0;
+        do {
+            $status = curl_multi_exec($multiHandle, $running);
+            // curl_multi_select() may return -1 immediately; sleep briefly in
+            // that case so the loop does not spin at 100% CPU.
+            if ($running > 0 && curl_multi_select($multiHandle, 1.0) === -1) {
+                usleep(1000);
+            }
+        } while ($running > 0 && ($status === CURLM_OK || $status === CURLM_CALL_MULTI_PERFORM));
+
+        // Per-transfer result codes are only reported through the multi
+        // handle's message queue, so collect them before inspecting handles.
+        while (($info = curl_multi_info_read($multiHandle)) !== false) {
+            $finishedUrl = array_search($info['handle'], $handles, true);
+            if ($finishedUrl !== false) {
+                $transferCodes[$finishedUrl] = $info['result'];
+            }
+        }
+
+        foreach ($handles as $url => $ch) {
+            $code = isset($transferCodes[$url]) ? $transferCodes[$url] : curl_errno($ch);
+
+            if ($code !== CURLE_OK) {
+                $message = curl_error($ch);
+                $results[$url] = ['error' => $message !== '' ? $message : (string) curl_strerror($code)];
+            } else {
+                $results[$url] = $this->buildResult($ch, (string) curl_multi_getcontent($ch));
+            }
+
+            curl_multi_remove_handle($multiHandle, $ch);
+            curl_close($ch);
+        }
+
+        curl_multi_close($multiHandle);
+
+        return $results;
+    }
+
+    /**
+     * Creates a cURL handle with the options shared by all requests.
+     *
+     * @param string $url Absolute URL to request.
+     * @return resource|CurlHandle Configured handle that has not been executed yet.
+     */
+    private function createCurlHandle($url)
+    {
+        $ch = curl_init();
+
+        curl_setopt_array($ch, [
+            CURLOPT_URL => $url,
+            CURLOPT_RETURNTRANSFER => true, // return the response instead of printing it
+            CURLOPT_HEADER => true,         // keep response headers in the returned string
+            CURLOPT_FOLLOWLOCATION => true, // follow redirects
+            CURLOPT_TIMEOUT => 30,          // seconds
+            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
+            // NOTE(review): certificate verification is disabled (marked as
+            // "tests only" by the author). Enable it before crawling sites
+            // outside a trusted test setup.
+            CURLOPT_SSL_VERIFYPEER => false,
+        ]);
+
+        return $ch;
+    }
+
+    /**
+     * Splits a raw response (headers + body) and collects the transfer metadata.
+     *
+     * @param resource|CurlHandle $ch       Handle the response was read from.
+     * @param string              $response Raw response including the header block(s).
+     * @return array Result array as documented on getWebsite().
+     */
+    private function buildResult($ch, $response)
+    {
+        // With redirects followed, the header size covers the header blocks of
+        // every hop, so everything after it is the final body.
+        $headerSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
+        $body = (string) substr($response, $headerSize);
+
+        return [
+            'url' => curl_getinfo($ch, CURLINFO_EFFECTIVE_URL),
+            'status_code' => curl_getinfo($ch, CURLINFO_HTTP_CODE),
+            'headers_parsed' => $this->parseHeaders((string) substr($response, 0, $headerSize)),
+            'body' => $body,
+            'response_time' => curl_getinfo($ch, CURLINFO_TOTAL_TIME),
+            'body_size' => strlen($body),
+        ];
+    }
+
+    /**
+     * Converts a raw header block into a name => value map.
+     *
+     * Lines without a colon (status lines, blank separators between redirect
+     * hops) are skipped; a header repeated later overwrites the earlier value.
+     *
+     * @param string $rawHeaders Header block using CRLF line endings.
+     * @return array Map of header name => header value.
+     */
+    private function parseHeaders($rawHeaders)
+    {
+        $parsed = [];
+
+        foreach (explode("\r\n", trim($rawHeaders)) as $line) {
+            if (strpos($line, ':') === false) {
+                continue;
+            }
+            list($name, $value) = explode(':', $line, 2);
+            $parsed[trim($name)] = trim($value);
+        }
+
+        return $parsed;
+    }
+
+    /**
+     * Stores the outcome of one fetch and queues the links found in its body.
+     *
+     * @param int    $crawlID Crawl the URL belongs to.
+     * @param string $url     URL as stored in the urls table (the requested URL).
+     * @param array  $data    Result array from getWebsite()/getMultipleWebsites().
+     */
+    function processResults(int $crawlID, string $url, array $data)
+    {
+        if (isset($data['error'])) {
+            echo "Fehler bei der Analyse von $url: " . $data['error'] . "\n";
+
+            // Stamp the row anyway: doCrawl() selects rows whose date is NULL,
+            // so an unreachable URL would otherwise be picked again on every
+            // run. status_code is left untouched, which keeps failures
+            // distinguishable from fetched pages.
+            $this->executeWrite(
+                'UPDATE urls SET date = now() WHERE url = ? AND crawl_id = ? LIMIT 1',
+                'si',
+                [$url, $crawlID]
+            );
+
+            // There is no body to scan for links.
+            return;
+        }
+
+        $this->executeWrite(
+            'UPDATE urls SET status_code = ?, response_time = ?, body_size = ?, date = now(), body = ?'
+            . ' WHERE url = ? AND crawl_id = ? LIMIT 1',
+            'idissi',
+            [
+                $data['status_code'],
+                $data['response_time'] * 1000, // cURL reports seconds; stored as milliseconds
+                $data['body_size'],
+                $data['body'],
+                $url,
+                $crawlID,
+            ]
+        );
+
+        // Relative links must be resolved against the URL the document was
+        // finally served from, which differs from $url after a redirect.
+        $baseUrl = !empty($data['url']) ? $data['url'] : $url;
+
+        $this->findNewUrls($crawlID, $data['body'], $url, $baseUrl);
+    }
+
+    /**
+     * Extracts the links of a page, registers their targets as URLs of the
+     * crawl and replaces the page's rows in the links table.
+     *
+     * Only http(s) targets are stored; mailto:, tel:, javascript: and similar
+     * links cannot be crawled and are skipped.
+     *
+     * @param int         $crawlID Crawl the page belongs to.
+     * @param string      $body    HTML body of the page.
+     * @param string      $url     URL of the page as stored in the urls table.
+     * @param string|null $baseUrl Base for resolving relative links; defaults to $url.
+     */
+    function findNewUrls(int $crawlID, string $body, string $url, $baseUrl = null)
+    {
+        $sourceId = $this->findUrlId($crawlID, $url);
+        if ($sourceId === null) {
+            error_log("webanalyse: no urls row for {$url} in crawl {$crawlID}; links not stored");
+            return;
+        }
+
+        $links = $this->extractLinks($body, $baseUrl === null ? $url : $baseUrl);
+
+        // Replace the outgoing links of this page instead of accumulating
+        // them across repeated crawls.
+        $this->executeWrite('DELETE FROM links WHERE von = ?', 'i', [$sourceId]);
+
+        foreach ($links as $link) {
+            if (!preg_match('/^https?:\/\//i', $link['absolute_url'])) {
+                continue;
+            }
+
+            $targetId = $this->ensureUrlId($crawlID, $link['absolute_url']);
+            if ($targetId === null) {
+                continue;
+            }
+
+            // rel tokens are case-insensitive in HTML, hence stripos().
+            $doFollow = stripos((string) $link['rel'], 'nofollow') === false ? 1 : 0;
+
+            $this->executeWrite(
+                'INSERT IGNORE INTO links (von, nach, linktext, dofollow) VALUES (?, ?, ?, ?)',
+                'iisi',
+                [$sourceId, $targetId, mb_convert_encoding($link['text'], 'UTF-8'), $doFollow]
+            );
+        }
+    }
+
+    /**
+     * Fetches and processes the next batch of not-yet-crawled URLs of a crawl.
+     *
+     * @param int $crawlID   Crawl whose pending URLs (date IS NULL) are processed.
+     * @param int $batchSize Maximum number of URLs fetched in parallel per call.
+     */
+    function doCrawl(int $crawlID, int $batchSize = 2)
+    {
+        $stmt = $this->prepareAndExecute(
+            'SELECT url FROM urls WHERE crawl_id = ? AND date IS NULL LIMIT ?',
+            'ii',
+            [$crawlID, max(1, $batchSize)]
+        );
+        if ($stmt === null) {
+            return;
+        }
+
+        $pending = [];
+        $pendingUrl = null;
+        $stmt->bind_result($pendingUrl);
+        while ($stmt->fetch()) {
+            $pending[] = $pendingUrl;
+        }
+        $stmt->close();
+
+        if ($pending === []) {
+            return;
+        }
+
+        foreach ($this->getMultipleWebsites($pending) as $url => $data) {
+            $this->processResults($crawlID, (string) $url, $data);
+        }
+    }
+
+    /**
+     * Prepares and executes a parameterised statement.
+     *
+     * @param string $sql    SQL text with ? placeholders.
+     * @param string $types  mysqli type string, one character per parameter.
+     * @param array  $params Values to bind, in placeholder order.
+     * @return mysqli_stmt|null Executed statement (the caller closes it), or
+     *                          null when preparing or executing failed.
+     */
+    private function prepareAndExecute($sql, $types = '', array $params = [])
+    {
+        $stmt = $this->db->prepare($sql);
+        if ($stmt === false) {
+            error_log('webanalyse: prepare failed: ' . $this->db->error);
+            return null;
+        }
+
+        if ($params !== []) {
+            $stmt->bind_param($types, ...$params);
+        }
+
+        if (!$stmt->execute()) {
+            error_log('webanalyse: execute failed: ' . $stmt->error);
+            $stmt->close();
+            return null;
+        }
+
+        return $stmt;
+    }
+
+    /**
+     * Runs a parameterised statement that returns no rows.
+     *
+     * @param string $sql    SQL text with ? placeholders.
+     * @param string $types  mysqli type string, one character per parameter.
+     * @param array  $params Values to bind, in placeholder order.
+     * @return bool True when the statement was executed.
+     */
+    private function executeWrite($sql, $types, array $params)
+    {
+        $stmt = $this->prepareAndExecute($sql, $types, $params);
+        if ($stmt === null) {
+            return false;
+        }
+
+        $stmt->close();
+
+        return true;
+    }
+
+    /**
+     * Looks up the id of a URL within a crawl.
+     *
+     * @param int    $crawlID Crawl to search in.
+     * @param string $url     Exact URL string.
+     * @return int|null Row id, or null when the URL is unknown or the query failed.
+     */
+    private function findUrlId(int $crawlID, string $url)
+    {
+        $stmt = $this->prepareAndExecute(
+            'SELECT id FROM urls WHERE url = ? AND crawl_id = ? LIMIT 1',
+            'si',
+            [$url, $crawlID]
+        );
+        if ($stmt === null) {
+            return null;
+        }
+
+        $id = null;
+        $stmt->bind_result($id);
+        $found = $stmt->fetch() === true;
+        $stmt->close();
+
+        return $found ? (int) $id : null;
+    }
+
+    /**
+     * Returns the id of a URL within a crawl, inserting the URL when needed.
+     *
+     * @param int    $crawlID Crawl the URL belongs to.
+     * @param string $url     Absolute URL.
+     * @return int|null Row id, or null when it could not be determined.
+     */
+    private function ensureUrlId(int $crawlID, string $url)
+    {
+        $stmt = $this->prepareAndExecute(
+            'INSERT IGNORE INTO urls (url, crawl_id) VALUES (?, ?)',
+            'si',
+            [$url, $crawlID]
+        );
+        if ($stmt === null) {
+            return null;
+        }
+
+        $id = (int) $stmt->insert_id;
+        $stmt->close();
+
+        // INSERT IGNORE reports insert id 0 when the insert was skipped, so
+        // fall back to looking up the existing row.
+        return $id !== 0 ? $id : $this->findUrlId($crawlID, $url);
+    }
+
+    /**
+     * Extracts all <a href> links from an HTML document.
+     *
+     * @param string $html    HTML source.
+     * @param string $baseUrl URL of the document; when given, relative hrefs are
+     *                        resolved against it and fragments are dropped from
+     *                        http(s) results.
+     * @return array List of link records with the keys index, href, absolute_url,
+     *               text, rel, title, target, is_external, link_type, is_internal.
+     */
+    function extractLinks($html, $baseUrl = '')
+    {
+        $html = (string) $html;
+        if (trim($html) === '') {
+            // DOMDocument::loadHTML() rejects empty input.
+            return [];
+        }
+
+        $dom = new DOMDocument();
+
+        // Collect parser warnings for malformed HTML internally instead of
+        // emitting them, then restore the caller's libxml setting.
+        $previousErrorMode = libxml_use_internal_errors(true);
+        // The XML declaration is an encoding hint: without one the parser
+        // does not treat the input as UTF-8.
+        $dom->loadHTML('<?xml encoding="UTF-8">' . $html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
+        libxml_clear_errors();
+        libxml_use_internal_errors($previousErrorMode);
+
+        $links = [];
+
+        foreach ($dom->getElementsByTagName('a') as $index => $anchor) {
+            $href = $anchor->getAttribute('href');
+            if ($href === '') {
+                // Anchors without a target are not links.
+                continue;
+            }
+
+            $rel = $anchor->getAttribute('rel');
+            $title = $anchor->getAttribute('title');
+            $target = $anchor->getAttribute('target');
+
+            $links[] = [
+                'index' => $index + 1,
+                'href' => $href,
+                'absolute_url' => empty($baseUrl) ? $href : $this->resolveUrl($href, $baseUrl),
+                'text' => trim($anchor->textContent),
+                'rel' => $rel ?: null,
+                'title' => $title ?: null,
+                'target' => $target ?: null,
+                'is_external' => $this->isExternalLink($href, $baseUrl),
+                'link_type' => $this->getLinkType($href),
+                'is_internal' => $this->isInternalLink($href, $baseUrl) ? 1 : 0,
+            ];
+        }
+
+        return $links;
+    }
+
+    /**
+     * Resolves an href against a base URL (simplified RFC 3986 section 5.2).
+     *
+     * Hrefs with a non-HTTP scheme (mailto:, tel:, javascript:, ...) are
+     * returned unchanged. For http(s) results the fragment is removed, because
+     * it addresses a position inside a document, not a different document.
+     *
+     * @param string $href    Raw href attribute value.
+     * @param string $baseUrl Absolute URL of the containing document.
+     * @return string Resolved URL, or the trimmed href when the base URL has no
+     *                scheme and host.
+     */
+    private function resolveUrl($href, $baseUrl)
+    {
+        $href = trim($href);
+
+        // Already carries a scheme: nothing to resolve.
+        if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $href)) {
+            return preg_match('/^https?:/i', $href) ? $this->stripFragment($href) : $href;
+        }
+
+        $base = parse_url($baseUrl);
+        if (!is_array($base) || empty($base['scheme']) || empty($base['host'])) {
+            return $href;
+        }
+
+        // Protocol-relative reference: inherits only the scheme.
+        if (strpos($href, '//') === 0) {
+            return $this->stripFragment($base['scheme'] . ':' . $href);
+        }
+
+        $authority = $base['scheme'] . '://' . $base['host'] . (isset($base['port']) ? ':' . $base['port'] : '');
+        $basePath = isset($base['path']) && $base['path'] !== '' ? $base['path'] : '/';
+
+        // Empty or fragment-only reference: the document itself.
+        if ($href === '' || $href[0] === '#') {
+            return $authority . $basePath . (isset($base['query']) ? '?' . $base['query'] : '');
+        }
+
+        // Query-only reference: same path, new query string.
+        if ($href[0] === '?') {
+            return $this->stripFragment($authority . $basePath . $href);
+        }
+
+        // Separate the path from a trailing ?query/#fragment so that dot
+        // segments are only normalised inside the path.
+        $pathLength = strcspn($href, '?#');
+        $path = substr($href, 0, $pathLength);
+        $suffix = (string) substr($href, $pathLength);
+
+        if ($path[0] !== '/') {
+            // Path-relative: replace everything after the last slash of the base path.
+            $path = substr($basePath, 0, strrpos($basePath, '/') + 1) . $path;
+        }
+
+        return $this->stripFragment($authority . $this->removeDotSegments($path) . $suffix);
+    }
+
+    /**
+     * Removes "." and ".." segments from an absolute path.
+     *
+     * @param string $path Path starting with "/".
+     * @return string Normalised path.
+     */
+    private function removeDotSegments($path)
+    {
+        $segments = explode('/', $path);
+        $last = end($segments);
+        $kept = [];
+
+        foreach ($segments as $segment) {
+            if ($segment === '.') {
+                continue;
+            }
+            if ($segment === '..') {
+                // Never pop the leading empty segment that represents the root.
+                if (count($kept) > 1) {
+                    array_pop($kept);
+                }
+                continue;
+            }
+            $kept[] = $segment;
+        }
+
+        // A path ending in a dot segment denotes a directory: keep the trailing slash.
+        if ($last === '.' || $last === '..') {
+            $kept[] = '';
+        }
+
+        return implode('/', $kept);
+    }
+
+    /**
+     * Cuts a "#fragment" suffix off a URL.
+     *
+     * @param string $url URL that may contain a fragment.
+     * @return string URL without the fragment.
+     */
+    private function stripFragment($url)
+    {
+        $hashPosition = strpos($url, '#');
+
+        return $hashPosition === false ? $url : substr($url, 0, $hashPosition);
+    }
+
+    /**
+     * Checks whether a link points to a different host than the base URL.
+     *
+     * Hrefs that are neither http(s) nor protocol-relative count as not external.
+     *
+     * @param string $href    Raw href attribute value.
+     * @param string $baseUrl URL of the containing document.
+     * @return bool|null Null when no base URL is available to compare against.
+     */
+    private function isExternalLink($href, $baseUrl)
+    {
+        if (empty($baseUrl)) {
+            return null;
+        }
+
+        // Protocol-relative links name a host as well; the scheme added here
+        // only makes the value parseable and does not affect the comparison.
+        if (strpos($href, '//') === 0) {
+            $href = 'http:' . $href;
+        }
+
+        if (!preg_match('/^https?:\/\//i', $href)) {
+            return false;
+        }
+
+        // Host names are case-insensitive.
+        $baseHost = strtolower((string) parse_url($baseUrl, PHP_URL_HOST));
+        $linkHost = strtolower((string) parse_url($href, PHP_URL_HOST));
+
+        return $baseHost !== $linkHost;
+    }
+
+    /**
+     * Checks whether a link stays on the host of the base URL.
+     *
+     * Defined as the negation of isExternalLink() so the two cannot disagree.
+     *
+     * @param string $href    Raw href attribute value.
+     * @param string $baseUrl URL of the containing document.
+     * @return bool|null Null when no base URL is available to compare against.
+     */
+    private function isInternalLink($href, $baseUrl)
+    {
+        $external = $this->isExternalLink($href, $baseUrl);
+
+        return $external === null ? null : !$external;
+    }
+
+    /**
+     * Classifies an href by its form.
+     *
+     * @param string $href Raw href attribute value.
+     * @return string One of empty, email, phone, anchor, javascript, absolute, relative.
+     */
+    private function getLinkType($href)
+    {
+        if (empty($href)) {
+            return 'empty';
+        }
+
+        // URI schemes are case-insensitive, hence stripos().
+        $prefixes = [
+            'mailto:' => 'email',
+            'tel:' => 'phone',
+            '#' => 'anchor',
+            'javascript:' => 'javascript',
+        ];
+        foreach ($prefixes as $prefix => $type) {
+            if (stripos($href, $prefix) === 0) {
+                return $type;
+            }
+        }
+
+        return preg_match('/^https?:\/\//i', $href) ? 'absolute' : 'relative';
+    }
+
+    /**
+     * Groups link records by their link_type.
+     *
+     * @param array $links Link records as returned by extractLinks().
+     * @return array Map of link type => list of link records, in input order.
+     */
+    function groupLinksByType($links)
+    {
+        $grouped = [];
+
+        foreach ($links as $link) {
+            $grouped[$link['link_type']][] = $link;
+        }
+
+        return $grouped;
+    }
+}