From daa76b2141b494781145c427b3c6c2b2ba4cffbd Mon Sep 17 00:00:00 2001 From: Martin Date: Sat, 4 Oct 2025 08:24:43 +0200 Subject: [PATCH] Remove legacy PHP files from root directory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removed unused legacy files: - index.php (old crawler entry point) - webanalyse.php (old crawler implementation) - setnew.php (database reset script) These files are no longer used. The current application uses: - src/index.php (web interface) - src/api.php (API endpoints) - src/classes/Crawler.php (crawler implementation) - src/crawler-worker.php (background worker) The legacy code remains in git history if needed. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- index.php | 11 - setnew.php | 11 - webanalyse.php | 530 ------------------------------------------------- 3 files changed, 552 deletions(-) delete mode 100644 index.php delete mode 100644 setnew.php delete mode 100644 webanalyse.php diff --git a/index.php b/index.php deleted file mode 100644 index 49d6b3f..0000000 --- a/index.php +++ /dev/null @@ -1,11 +0,0 @@ -doCrawl(1); diff --git a/setnew.php b/setnew.php deleted file mode 100644 index c5b508a..0000000 --- a/setnew.php +++ /dev/null @@ -1,11 +0,0 @@ -query("truncate table crawl"); -// $db->query("insert into crawl (start_url, user_id) values ('https://kies-media.de/', 1)"); -$db->query("insert into crawl (start_url, user_id) values ('https://kies-media.de/leistungen/externer-ausbilder-fuer-fachinformatiker/', 1)"); - -$db->query("truncate table urls"); -$urls = $db->query("insert ignore into urls (id, url, crawl_id) select 1,start_url, id from crawl where id = 1"); #->fetch_all(MYSQLI_ASSOC) - -$db->query("truncate table links"); \ No newline at end of file diff --git a/webanalyse.php b/webanalyse.php deleted file mode 100644 index b3be88d..0000000 --- a/webanalyse.php +++ /dev/null @@ -1,530 +0,0 @@ -set_charset('utf8mb4'); - $this->db = $connection; - } - - /** - * Holt eine einzelne URL und gibt Response-Metadaten zurueck. - * - * @param string $url Zieladresse fuer den Abruf. - * @return array Antwortdaten oder ein "error"-Schluessel. - */ - public function getWebsite(string $url): array - { - $handle = $this->createCurlHandle($url); - $response = curl_exec($handle); - - if ($response === false) { - $error = curl_error($handle); - curl_close($handle); - return ['error' => $error]; - } - - $info = curl_getinfo($handle); - curl_close($handle); - - return $this->buildResponsePayload($response, $info); - } - - /** - * Ruft mehrere URLs parallel via curl_multi ab. - * - * @param array $urls Liste von Ziel-URLs. - * @return array> Antworten je URL. - */ - public function getMultipleWebsites(array $urls): array - { - if ($urls === []) { - return []; - } - - $results = []; - $multiHandle = curl_multi_init(); - $handles = []; - - foreach ($urls as $url) { - $handle = $this->createCurlHandle($url); - $handles[$url] = $handle; - curl_multi_add_handle($multiHandle, $handle); - } - - $running = null; - do { - $status = curl_multi_exec($multiHandle, $running); - } while ($status === CURLM_CALL_MULTI_PERFORM); - - while ($running && $status === CURLM_OK) { - if (curl_multi_select($multiHandle, 1.0) === -1) { - usleep(100000); - } - - do { - $status = curl_multi_exec($multiHandle, $running); - } while ($status === CURLM_CALL_MULTI_PERFORM); - } - - foreach ($handles as $url => $handle) { - $response = curl_multi_getcontent($handle); - - if ($response === false) { - $results[$url] = ['error' => curl_error($handle)]; - } else { - $results[$url] = $this->buildResponsePayload($response, curl_getinfo($handle)); - } - - curl_multi_remove_handle($multiHandle, $handle); - curl_close($handle); - } - - curl_multi_close($multiHandle); - - return $results; - } - - /** - * Persistiert Response-Daten und stoesst die Analyse der gefundenen Links an. - * - * @param int $crawlID Identifier der Crawl-Session. - * @param string $url Ursprung-URL, deren Antwort verarbeitet wird. - * @param array $data Ergebnis der HTTP-Abfrage. - */ - public function processResults(int $crawlID, string $url, array $data): void - { - if (isset($data['error'])) { - error_log(sprintf('Fehler bei der Analyse von %s: %s', $url, $data['error'])); - return; - } - - $body = (string)($data['body'] ?? ''); - - $update = $this->db->prepare( - 'UPDATE urls - SET status_code = ?, response_time = ?, body_size = ?, date = NOW(), body = ? - WHERE url = ? AND crawl_id = ? - LIMIT 1' - ); - - if ($update === false) { - throw new RuntimeException('Update-Statement konnte nicht vorbereitet werden: ' . $this->db->error); - } - - $statusCode = (int)($data['status_code'] ?? 0); - $responseTimeMs = (int)round(((float)($data['response_time'] ?? 0)) * 1000); - $bodySize = (int)($data['body_size'] ?? strlen($body)); - - $update->bind_param('iiissi', $statusCode, $responseTimeMs, $bodySize, $body, $url, $crawlID); - $update->execute(); - $update->close(); - - $this->findNewUrls($crawlID, $body, $url); - } - - /** - * Extrahiert Links aus einer Antwort und legt neue URL-Datensaetze an. - * - * @param int $crawlID Identifier der Crawl-Session. - * @param string $body HTML-Koerper der Antwort. - * @param string $url Bearbeitete URL, dient als Kontext fuer relative Links. - */ - public function findNewUrls(int $crawlID, string $body, string $url): void - { - if ($body === '') { - return; - } - - $links = $this->extractLinks($body, $url); - if ($links === []) { - return; - } - - $originId = $this->resolveUrlId($crawlID, $url); - if ($originId === null) { - return; - } - - $deleteLinksStmt = $this->db->prepare('DELETE FROM links WHERE von = ?'); - if ($deleteLinksStmt !== false) { - $deleteLinksStmt->bind_param('i', $originId); - $deleteLinksStmt->execute(); - $deleteLinksStmt->close(); - } - - $insertUrlStmt = $this->db->prepare('INSERT IGNORE INTO urls (url, crawl_id) VALUES (?, ?)'); - $selectUrlStmt = $this->db->prepare('SELECT id FROM urls WHERE url = ? AND crawl_id = ? LIMIT 1'); - $insertLinkStmt = $this->db->prepare('INSERT IGNORE INTO links (von, nach, linktext, dofollow) VALUES (?, ?, ?, ?)'); - - if (!$insertUrlStmt || !$selectUrlStmt || !$insertLinkStmt) { - throw new RuntimeException('Vorbereitete Statements konnten nicht erstellt werden: ' . $this->db->error); - } - - foreach ($links as $link) { - $absoluteUrl = (string)$link['absolute_url']; - - $insertUrlStmt->bind_param('si', $absoluteUrl, $crawlID); - $insertUrlStmt->execute(); - - $targetId = $this->db->insert_id; - if ($targetId === 0) { - $selectUrlStmt->bind_param('si', $absoluteUrl, $crawlID); - $selectUrlStmt->execute(); - $result = $selectUrlStmt->get_result(); - $targetId = $result ? (int)($result->fetch_assoc()['id'] ?? 0) : 0; - } - - if ($targetId === 0) { - continue; - } - - $linkText = $this->normaliseText((string)($link['text'] ?? '')); - $isFollow = (int)(strpos((string)($link['rel'] ?? ''), 'nofollow') !== false ? 0 : 1); - - $insertLinkStmt->bind_param('iisi', $originId, $targetId, $linkText, $isFollow); - $insertLinkStmt->execute(); - } - - $insertUrlStmt->close(); - $selectUrlStmt->close(); - $insertLinkStmt->close(); - } - - /** - * Startet einen Crawl-Durchlauf fuer unbehandelte URLs. - * - * @param int $crawlID Identifier der Crawl-Session. - */ - public function doCrawl(int $crawlID): void - { - $statement = $this->db->prepare( - 'SELECT url FROM urls WHERE crawl_id = ? AND date IS NULL LIMIT 50' - ); - - if ($statement === false) { - return; - } - - $statement->bind_param('i', $crawlID); - $statement->execute(); - $result = $statement->get_result(); - - if (!$result instanceof mysqli_result) { - $statement->close(); - return; - } - - $urls = []; - while ($row = $result->fetch_assoc()) { - $urls[] = $row['url']; - } - - $result->free(); - $statement->close(); - - if ($urls === []) { - return; - } - - foreach ($this->getMultipleWebsites($urls) as $url => $data) { - $this->processResults($crawlID, $url, $data); - } - } - - /** - * Parst HTML-Inhalt und liefert eine strukturierte Liste gefundener Links. - * - * @param string $html Rohes HTML-Dokument. - * @param string $baseUrl Basis-URL fuer die Aufloesung relativer Pfade. - * @return array> Gesammelte Linkdaten. - */ - public function extractLinks(string $html, string $baseUrl = ''): array - { - $links = []; - - $dom = new DOMDocument(); - $previous = libxml_use_internal_errors(true); - $dom->loadHTML('' . $html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD); - libxml_clear_errors(); - libxml_use_internal_errors($previous); - - foreach ($dom->getElementsByTagName('a') as $index => $aTag) { - $href = trim($aTag->getAttribute('href')); - if ($href === '') { - continue; - } - - $absoluteUrl = $this->resolveUrl($href, $baseUrl); - $text = $this->normaliseText(trim($aTag->textContent)); - $rel = $aTag->getAttribute('rel'); - $title = $aTag->getAttribute('title'); - $target = $aTag->getAttribute('target'); - - $links[] = [ - 'index' => $index + 1, - 'href' => $href, - 'absolute_url' => $absoluteUrl, - 'text' => $text, - 'rel' => $rel !== '' ? $rel : null, - 'title' => $title !== '' ? $title : null, - 'target' => $target !== '' ? $target : null, - 'is_external' => $this->isExternalLink($absoluteUrl, $baseUrl), - 'link_type' => $this->getLinkType($href), - 'is_internal' => $this->isInternalLink($absoluteUrl, $baseUrl) ? 1 : 0, - ]; - } - - return $links; - } - - /** - * Prueft, ob ein Link aus Sicht der Basis-URL extern ist. - * - * @param string $href Ziel des Links. - * @param string $baseUrl Ausgangsadresse zur Domainabgleichung. - * @return bool|null True fuer extern, false fuer intern, null falls undefiniert. - */ - private function isExternalLink(string $href, string $baseUrl): ?bool - { - if ($baseUrl === '') { - return null; - } - - $baseDomain = parse_url($baseUrl, PHP_URL_HOST); - $linkDomain = parse_url($href, PHP_URL_HOST); - - if ($baseDomain === null || $linkDomain === null) { - return null; - } - - return !hash_equals($baseDomain, $linkDomain); - } - - /** - * Prueft, ob ein Link derselben Domain wie die Basis-URL entspricht. - * - * @param string $href Ziel des Links. - * @param string $baseUrl Ausgangsadresse zur Domainabgleichung. - * @return bool|null True fuer intern, false fuer extern, null falls undefiniert. - */ - private function isInternalLink(string $href, string $baseUrl): ?bool - { - if ($baseUrl === '') { - return null; - } - - $baseDomain = parse_url($baseUrl, PHP_URL_HOST); - $linkDomain = parse_url($href, PHP_URL_HOST); - - if ($baseDomain === null || $linkDomain === null) { - return null; - } - - return hash_equals($baseDomain, $linkDomain); - } - - /** - * Leitet den Link-Typ anhand gaengiger Protokolle und Muster ab. - * - * @param string $href Ziel des Links. - * @return string Beschreibender Typ wie "absolute" oder "email". - */ - private function getLinkType(string $href): string - { - if ($href === '') { - return 'empty'; - } - - $lower = strtolower($href); - if (strpos($lower, 'mailto:') === 0) { - return 'email'; - } - if (strpos($lower, 'tel:') === 0) { - return 'phone'; - } - if (strpos($lower, '#') === 0) { - return 'anchor'; - } - if (strpos($lower, 'javascript:') === 0) { - return 'javascript'; - } - if (filter_var($href, FILTER_VALIDATE_URL)) { - return 'absolute'; - } - - return 'relative'; - } - - /** - * Gruppiert Links anhand ihres vorab bestimmten Typs. - * - * @param array> $links Liste der extrahierten Links. - * @return array>> Links nach Typ gruppiert. - */ - public function groupLinksByType(array $links): array - { - $grouped = []; - - foreach ($links as $link) { - $type = (string)($link['link_type'] ?? 'unknown'); - $grouped[$type][] = $link; - } - - return $grouped; - } - - /** - * Erstellt ein konfiguriertes Curl-Handle fuer einen Request. - * - * @return CurlHandle - */ - private function createCurlHandle(string $url) - { - $handle = curl_init($url); - if ($handle === false) { - throw new RuntimeException('Konnte Curl-Handle nicht initialisieren: ' . $url); - } - - curl_setopt_array($handle, [ - CURLOPT_URL => $url, - CURLOPT_RETURNTRANSFER => true, - CURLOPT_HEADER => true, - CURLOPT_FOLLOWLOCATION => true, - CURLOPT_TIMEOUT => self::CURL_TIMEOUT, - CURLOPT_USERAGENT => self::USER_AGENT, - CURLOPT_SSL_VERIFYPEER => false, - ]); - - return $handle; - } - - /** - * Splittet Header und Body und bereitet das Antwort-Array auf. - * - * @param string $response Vollstaendige Response inkl. Header. - * @param array $info curl_getinfo Ergebnis. - * @return array - */ - private function buildResponsePayload(string $response, array $info): array - { - $headerSize = (int)($info['header_size'] ?? 0); - $headers = substr($response, 0, $headerSize); - $body = substr($response, $headerSize); - - return [ - 'url' => $info['url'] ?? ($info['redirect_url'] ?? ''), - 'status_code' => (int)($info['http_code'] ?? 0), - 'headers_parsed' => $this->parseHeaders($headers), - 'body' => $body, - 'response_time' => (float)($info['total_time'] ?? 0.0), - 'body_size' => strlen($body), - ]; - } - - /** - * Wandelt Header-String in ein assoziatives Array um. - * - * @param string $headers Roh-Header. - * @return array - */ - private function parseHeaders(string $headers): array - { - $parsed = []; - foreach (preg_split('/\r?\n/', trim($headers)) as $line) { - if ($line === '' || strpos($line, ':') === false) { - continue; - } - - [$key, $value] = explode(':', $line, 2); - $parsed[trim($key)] = trim($value); - } - - return $parsed; - } - - /** - * Normalisiert relativen Pfad gegenueber einer Basis-URL zu einer absoluten Adresse. - */ - private function resolveUrl(string $href, string $baseUrl): string - { - if ($href === '' || filter_var($href, FILTER_VALIDATE_URL)) { - return $href; - } - - if ($baseUrl === '') { - return $href; - } - - $baseParts = parse_url($baseUrl); - if ($baseParts === false || !isset($baseParts['scheme'], $baseParts['host'])) { - return $href; - } - - $scheme = $baseParts['scheme']; - $host = $baseParts['host']; - $port = isset($baseParts['port']) ? ':' . $baseParts['port'] : ''; - $basePath = $baseParts['path'] ?? '/'; - - if (strpos($href, '/') === 0) { - $path = $href; - } else { - if (substr($basePath, -1) !== '/') { - $basePath = preg_replace('#/[^/]*$#', '/', $basePath) ?: '/'; - } - $path = $basePath . $href; - } - - return sprintf('%s://%s%s%s', $scheme, $host, $port, '/' . ltrim($path, '/')); - } - - /** - * Sorgt fuer sauberen UTF-8 Text ohne Steuerzeichen. - */ - private function normaliseText(string $text): string - { - $normalized = preg_replace('/\s+/u', ' ', $text) ?? ''; - $encoding = mb_detect_encoding($normalized, ['UTF-8', 'ISO-8859-1', 'Windows-1252'], true) ?: 'UTF-8'; - - return trim(mb_convert_encoding($normalized, 'UTF-8', $encoding)); - } - - /** - * Ermittelt die ID einer URL innerhalb eines Crawl-Durchlaufs. - */ - private function resolveUrlId(int $crawlID, string $url): ?int - { - $statement = $this->db->prepare('SELECT id FROM urls WHERE url = ? AND crawl_id = ? LIMIT 1'); - if ($statement === false) { - return null; - } - - $statement->bind_param('si', $url, $crawlID); - $statement->execute(); - $result = $statement->get_result(); - $id = $result ? $result->fetch_assoc()['id'] ?? null : null; - $statement->close(); - - return $id !== null ? (int)$id : null; - } -}