Sonstiges

This commit is contained in:
2025-10-03 20:22:17 +02:00
parent a6e2a7733e
commit 4e868ca8e9
2 changed files with 361 additions and 290 deletions

View File

@@ -1,13 +1,11 @@
<?php
declare(strict_types=1);

// Development entry point: report every notice and warning while crawling.
error_reporting(E_ALL);
ini_set('display_errors', '1');

require_once 'webanalyse.php';

// Open the connection once and hand it to the crawler. Previously this
// connection was created but never used, while WebAnalyse opened a second
// one on its own.
$db = mysqli_connect('localhost', 'root', '', 'screaming_frog');
if (!$db instanceof mysqli) {
    throw new RuntimeException(
        'Verbindung zur Datenbank konnte nicht hergestellt werden: ' . mysqli_connect_error()
    );
}

$wa = new WebAnalyse($db);

// Process one batch of pending URLs of crawl session 1.
$wa->doCrawl(1);

View File

@@ -1,88 +1,53 @@
<?php
declare(strict_types=1);
/**
* Klasse uebernimmt das Crawlen von Websites und persistiert Metadaten in MySQL.
* Koordiniert Webseiten-Crawls und persistiert Antwortdaten in der Screaming Frog Datenbank.
*/
class webanalyse
class WebAnalyse
{
/** User agent string sent with every crawl request. */
private const USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36';

/** Maximum duration of a single request in seconds (passed as CURLOPT_TIMEOUT). */
private const CURL_TIMEOUT = 30;

/**
 * @var mysqli Connection to the Screaming Frog database.
 */
private mysqli $db;

/**
 * Stores the database connection used by all queries of this instance.
 *
 * @param mysqli|null $connection Existing connection to reuse. When null, a
 *                                connection to the local "screaming_frog"
 *                                database is opened.
 * @throws RuntimeException If no connection can be established or the
 *                          connection charset cannot be switched to utf8mb4.
 */
public function __construct(?mysqli $connection = null)
{
    if ($connection === null) {
        try {
            $connection = mysqli_connect('localhost', 'root', '', 'screaming_frog');
        } catch (mysqli_sql_exception $exception) {
            // Depending on the mysqli report mode a failed connect throws
            // instead of returning false; surface both as RuntimeException.
            throw new RuntimeException(
                'Verbindung zur Datenbank konnte nicht hergestellt werden: ' . $exception->getMessage(),
                0,
                $exception
            );
        }
    }

    if (!$connection instanceof mysqli) {
        throw new RuntimeException(
            'Verbindung zur Datenbank konnte nicht hergestellt werden: ' . mysqli_connect_error()
        );
    }

    // Page bodies and link texts are stored as UTF-8, so the connection must
    // use a 4-byte capable charset; a silent failure here would corrupt data.
    if (!$connection->set_charset('utf8mb4')) {
        throw new RuntimeException(
            'Zeichensatz utf8mb4 konnte nicht gesetzt werden: ' . $connection->error
        );
    }

    $this->db = $connection;
}
/**
 * Fetches a single URL and returns the response metadata.
 *
 * @param string $url Target address to request.
 * @return array<string,mixed> Response data as produced by
 *                             buildResponsePayload(), or an array with a
 *                             single "error" key holding the cURL error
 *                             message when the transfer failed.
 */
public function getWebsite(string $url): array
{
    $handle = $this->createCurlHandle($url);

    try {
        $response = curl_exec($handle);
        if ($response === false) {
            return ['error' => curl_error($handle)];
        }

        return $this->buildResponsePayload($response, curl_getinfo($handle));
    } finally {
        // Release the handle on every path, including exceptions raised
        // while the payload is being built.
        curl_close($handle);
    }
}
/**
 * Fetches several URLs concurrently through a cURL multi handle.
 *
 * @param array<int,string> $urls List of target URLs; duplicates are
 *                                requested only once.
 * @return array<string,array<string,mixed>> One entry per URL, keyed by the
 *                                URL: either the payload produced by
 *                                buildResponsePayload() or an array with a
 *                                single "error" key.
 */
public function getMultipleWebsites(array $urls): array
{
    // Results are keyed by URL, so a duplicate would overwrite the first
    // handle in the lookup table and that handle would never be released.
    $urls = array_values(array_unique($urls));
    if ($urls === []) {
        return [];
    }

    $results = [];
    $handles = [];
    $multiHandle = curl_multi_init();

    try {
        foreach ($urls as $url) {
            $handle = $this->createCurlHandle($url);
            curl_multi_add_handle($multiHandle, $handle);
            $handles[$url] = $handle;
        }

        $running = null;
        do {
            $status = curl_multi_exec($multiHandle, $running);
        } while ($status === CURLM_CALL_MULTI_PERFORM);

        while ($running && $status === CURLM_OK) {
            // Wait for socket activity; back off briefly when select fails
            // so the loop does not spin.
            if (curl_multi_select($multiHandle, 1.0) === -1) {
                usleep(100000);
            }

            do {
                $status = curl_multi_exec($multiHandle, $running);
            } while ($status === CURLM_CALL_MULTI_PERFORM);
        }

        // In multi mode the outcome of each transfer is reported through the
        // message queue; curl_multi_getcontent() does not signal failures.
        $failures = [];
        while (($message = curl_multi_info_read($multiHandle)) !== false) {
            if ($message['result'] === CURLE_OK) {
                continue;
            }

            $failedUrl = array_search($message['handle'], $handles, true);
            if ($failedUrl === false) {
                continue;
            }

            $detail = curl_error($message['handle']);
            $failures[$failedUrl] = $detail !== ''
                ? $detail
                : (string) curl_strerror($message['result']);
        }

        foreach ($handles as $url => $handle) {
            if (isset($failures[$url])) {
                $results[$url] = ['error' => $failures[$url]];
                continue;
            }

            $response = (string) curl_multi_getcontent($handle);
            $results[$url] = $this->buildResponsePayload($response, curl_getinfo($handle));
        }
    } finally {
        foreach ($handles as $handle) {
            curl_multi_remove_handle($multiHandle, $handle);
            curl_close($handle);
        }
        curl_multi_close($multiHandle);
    }

    return $results;
}
/**
 * Persists the response data of one URL and triggers link analysis.
 *
 * A failed fetch is logged and the row is still stamped (status 0, empty
 * body), so the URL leaves the backlog of rows whose date is NULL.
 *
 * @param int $crawlID Identifier of the crawl session.
 * @param string $url Requested URL whose response is processed.
 * @param array<string,mixed> $data Result of the HTTP request, either a
 *                                  response payload or an "error" entry.
 * @return void
 * @throws RuntimeException If the UPDATE statement cannot be prepared or
 *                          executed.
 */
public function processResults(int $crawlID, string $url, array $data): void
{
    if (isset($data['error'])) {
        error_log(sprintf('Fehler bei der Analyse von %s: %s', $url, $data['error']));
        // Without a timestamp the URL would be selected again by every
        // subsequent batch and could block the queue.
        $this->storeResponse($crawlID, $url, 0, 0, 0, '');

        return;
    }

    $body = (string) ($data['body'] ?? '');
    $statusCode = (int) ($data['status_code'] ?? 0);
    // cURL reports seconds as float; the value is stored in milliseconds.
    $responseTimeMs = (int) round(((float) ($data['response_time'] ?? 0)) * 1000);
    $bodySize = (int) ($data['body_size'] ?? strlen($body));

    $this->storeResponse($crawlID, $url, $statusCode, $responseTimeMs, $bodySize, $body);
    $this->findNewUrls($crawlID, $body, $url);
}

/**
 * Writes status, timing, size, body and the current timestamp to the URL row.
 *
 * @throws RuntimeException If the statement cannot be prepared or executed.
 */
private function storeResponse(
    int $crawlID,
    string $url,
    int $statusCode,
    int $responseTimeMs,
    int $bodySize,
    string $body
): void {
    $update = $this->db->prepare(
        'UPDATE urls'
        . ' SET status_code = ?, response_time = ?, body_size = ?, date = NOW(), body = ?'
        . ' WHERE url = ? AND crawl_id = ?'
        . ' LIMIT 1'
    );
    if ($update === false) {
        throw new RuntimeException('Update-Statement konnte nicht vorbereitet werden: ' . $this->db->error);
    }

    try {
        $update->bind_param('iiissi', $statusCode, $responseTimeMs, $bodySize, $body, $url, $crawlID);
        if (!$update->execute()) {
            throw new RuntimeException('Update-Statement konnte nicht ausgefuehrt werden: ' . $update->error);
        }
    } finally {
        $update->close();
    }
}
/**
 * Extracts the links of a response and stores them as URL and link rows.
 *
 * Existing link rows of the page are replaced. Only http(s) targets are
 * stored, because every row in "urls" is later fetched by the crawler.
 *
 * @param int $crawlID Identifier of the crawl session.
 * @param string $body HTML body of the response.
 * @param string $url Processed URL; base for resolving relative links.
 * @return void
 * @throws RuntimeException If a required statement cannot be prepared.
 */
public function findNewUrls(int $crawlID, string $body, string $url): void
{
    if ($body === '') {
        return;
    }

    $links = $this->extractLinks($body, $url);
    $originId = $this->resolveUrlId($crawlID, $url);
    if ($originId === null) {
        return;
    }

    $prepare = function (string $sql): mysqli_stmt {
        $statement = $this->db->prepare($sql);
        if ($statement === false) {
            throw new RuntimeException(
                'Vorbereitete Statements konnten nicht erstellt werden: ' . $this->db->error
            );
        }

        return $statement;
    };

    // Remove the previous link set first, so a page that no longer contains
    // links does not keep stale rows.
    $deleteLinks = $prepare('DELETE FROM links WHERE von = ?');
    $deleteLinks->bind_param('i', $originId);
    $deleteLinks->execute();
    $deleteLinks->close();

    if ($links === []) {
        return;
    }

    $insertUrl = $prepare('INSERT IGNORE INTO urls (url, crawl_id) VALUES (?, ?)');
    $selectUrl = $prepare('SELECT id FROM urls WHERE url = ? AND crawl_id = ? LIMIT 1');
    $insertLink = $prepare('INSERT IGNORE INTO links (von, nach, linktext, dofollow) VALUES (?, ?, ?, ?)');

    // Parameters are bound by reference, so binding once is sufficient; the
    // loop only assigns new values to the bound variables.
    $absoluteUrl = '';
    $targetId = 0;
    $linkText = '';
    $isFollow = 1;
    $insertUrl->bind_param('si', $absoluteUrl, $crawlID);
    $selectUrl->bind_param('si', $absoluteUrl, $crawlID);
    $insertLink->bind_param('iisi', $originId, $targetId, $linkText, $isFollow);

    try {
        foreach ($links as $link) {
            $absoluteUrl = (string) ($link['absolute_url'] ?? '');
            // mailto:, tel:, javascript: and unresolved relative targets
            // cannot be fetched and must not enter the crawl queue.
            if (preg_match('#^https?://#i', $absoluteUrl) !== 1) {
                continue;
            }

            $insertUrl->execute();
            $targetId = (int) $insertUrl->insert_id;

            if ($targetId === 0) {
                // INSERT IGNORE skipped an existing row; look up its id.
                $selectUrl->execute();
                $result = $selectUrl->get_result();
                if ($result instanceof mysqli_result) {
                    $row = $result->fetch_assoc();
                    $targetId = (int) ($row['id'] ?? 0);
                    $result->free();
                }
            }

            if ($targetId === 0) {
                continue;
            }

            $linkText = $this->normaliseText((string) ($link['text'] ?? ''));
            // rel tokens are compared case-insensitively.
            $isFollow = stripos((string) ($link['rel'] ?? ''), 'nofollow') === false ? 1 : 0;
            $insertLink->execute();
        }
    } finally {
        $insertUrl->close();
        $selectUrl->close();
        $insertLink->close();
    }
}
/**
 * Crawls one batch of up to 50 URLs of a session that have no date yet.
 *
 * @param int $crawlID Identifier of the crawl session.
 * @return void
 * @throws RuntimeException If the pending URLs cannot be queried.
 */
public function doCrawl(int $crawlID): void
{
    $statement = $this->db->prepare(
        'SELECT url FROM urls WHERE crawl_id = ? AND date IS NULL LIMIT 50'
    );
    if ($statement === false) {
        // Same failure handling as the other statement-based methods; a
        // silent return would hide a broken schema or connection.
        throw new RuntimeException('Select-Statement konnte nicht vorbereitet werden: ' . $this->db->error);
    }

    $urls = [];
    try {
        $statement->bind_param('i', $crawlID);
        $statement->execute();
        $result = $statement->get_result();
        if (!$result instanceof mysqli_result) {
            throw new RuntimeException('Offene URLs konnten nicht gelesen werden: ' . $statement->error);
        }

        while ($row = $result->fetch_assoc()) {
            $urls[] = (string) $row['url'];
        }
        $result->free();
    } finally {
        $statement->close();
    }

    if ($urls === []) {
        return;
    }

    foreach ($this->getMultipleWebsites($urls) as $url => $data) {
        // PHP turns numeric-string array keys into integers; cast back so
        // the strictly typed parameter accepts the key.
        $this->processResults($crawlID, (string) $url, $data);
    }
}
/**
 * Parses HTML and returns a structured list of the anchors it contains.
 *
 * Anchors without a non-empty href attribute are skipped.
 *
 * @param string $html HTML markup to analyse.
 * @param string $baseUrl Base URL for resolving relative references.
 * @return array<int,array<string,mixed>> One entry per link with the keys
 *         index, href, absolute_url, text, rel, title, target, is_external,
 *         link_type and is_internal.
 */
public function extractLinks(string $html, string $baseUrl = ''): array
{
    if (trim($html) === '') {
        return [];
    }

    $dom = new DOMDocument();
    $previous = libxml_use_internal_errors(true);
    try {
        // The XML declaration makes libxml read the markup as UTF-8.
        $dom->loadHTML('<?xml encoding="UTF-8">' . $html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
    } finally {
        // Restore the caller's libxml error mode even if parsing throws.
        libxml_clear_errors();
        libxml_use_internal_errors($previous);
    }

    $links = [];
    foreach ($dom->getElementsByTagName('a') as $index => $aTag) {
        $href = trim($aTag->getAttribute('href'));
        if ($href === '') {
            continue;
        }

        $absoluteUrl = $this->resolveUrl($href, $baseUrl);
        $rel = $aTag->getAttribute('rel');
        $title = $aTag->getAttribute('title');
        $target = $aTag->getAttribute('target');

        $links[] = [
            // 1-based position among all <a> elements of the document.
            'index' => $index + 1,
            'href' => $href,
            'absolute_url' => $absoluteUrl,
            'text' => $this->normaliseText(trim($aTag->textContent)),
            'rel' => $rel !== '' ? $rel : null,
            'title' => $title !== '' ? $title : null,
            'target' => $target !== '' ? $target : null,
            'is_external' => $this->isExternalLink($absoluteUrl, $baseUrl),
            'link_type' => $this->getLinkType($href),
            // An undetermined result (null) is stored as 0.
            'is_internal' => $this->isInternalLink($absoluteUrl, $baseUrl) ? 1 : 0,
        ];
    }

    return $links;
}
@@ -379,19 +305,20 @@ class webanalyse
/**
 * Tells whether a link points to a different host than the base URL.
 *
 * @param string $href Link target; only a URL with a host can be classified.
 * @param string $baseUrl Reference address whose host is compared.
 * @return bool|null True for external, false for internal, null when either
 *                   URL has no host, is malformed, or no base URL is given.
 */
private function isExternalLink(string $href, string $baseUrl): ?bool
{
    if ($baseUrl === '') {
        return null;
    }

    $baseDomain = parse_url($baseUrl, PHP_URL_HOST);
    $linkDomain = parse_url($href, PHP_URL_HOST);

    // parse_url() yields null for a missing host and false for a seriously
    // malformed URL; neither can be compared.
    if (!is_string($baseDomain) || !is_string($linkDomain)) {
        return null;
    }

    // Host names are case-insensitive.
    return strcasecmp($baseDomain, $linkDomain) !== 0;
}
/**
 * Tells whether a link points to the same host as the base URL.
 *
 * @param string $href Link target; only a URL with a host can be classified.
 * @param string $baseUrl Reference address whose host is compared.
 * @return bool|null True for internal, false for external, null when either
 *                   URL has no host, is malformed, or no base URL is given.
 */
private function isInternalLink(string $href, string $baseUrl): ?bool
{
    if ($baseUrl === '') {
        return null;
    }

    $baseDomain = parse_url($baseUrl, PHP_URL_HOST);
    $linkDomain = parse_url($href, PHP_URL_HOST);

    // parse_url() yields null for a missing host and false for a seriously
    // malformed URL; neither can be compared.
    if (!is_string($baseDomain) || !is_string($linkDomain)) {
        return null;
    }

    // Host names are case-insensitive.
    return strcasecmp($baseDomain, $linkDomain) === 0;
}
/**
 * Classifies a raw href value.
 *
 * @param string $href Link target as written in the document.
 * @return string One of "empty", "email", "phone", "anchor", "javascript",
 *                "absolute" or "relative".
 */
private function getLinkType(string $href): string
{
    if ($href === '') {
        return 'empty';
    }

    // Prefixes are matched case-insensitively, e.g. "MAILTO:" counts as email.
    $prefixTypes = [
        'mailto:' => 'email',
        'tel:' => 'phone',
        '#' => 'anchor',
        'javascript:' => 'javascript',
    ];
    foreach ($prefixTypes as $prefix => $type) {
        if (strncasecmp($href, $prefix, strlen($prefix)) === 0) {
            return $type;
        }
    }

    // FILTER_VALIDATE_URL alone rejects protocol-relative references and
    // URLs containing non-ASCII characters, so "scheme://" and "//" are
    // recognised explicitly in addition.
    if (
        preg_match('#^(?:[a-z][a-z0-9+.\-]*:)?//#i', $href) === 1
        || filter_var($href, FILTER_VALIDATE_URL) !== false
    ) {
        return 'absolute';
    }

    return 'relative';
}
/**
 * Groups links by their previously determined "link_type" value.
 *
 * @param array<int,array<string,mixed>> $links List of extracted links.
 * @return array<string,array<int,array<string,mixed>>> Links grouped by
 *         type; entries without a "link_type" are collected under "unknown".
 */
public function groupLinksByType(array $links): array
{
    $grouped = [];

    foreach ($links as $link) {
        $type = (string) ($link['link_type'] ?? 'unknown');
        // Appending creates the group on first use.
        $grouped[$type][] = $link;
    }

    return $grouped;
}
/**
 * Creates a configured cURL handle for one request.
 *
 * The handle returns the response including headers as a string, follows
 * redirects, and is limited to HTTP and HTTPS.
 *
 * @param string $url Address to request.
 * @return CurlHandle
 * @throws RuntimeException If the handle cannot be created or configured.
 */
private function createCurlHandle(string $url)
{
    // curl_init() already assigns the URL, so CURLOPT_URL is not repeated.
    $handle = curl_init($url);
    if ($handle === false) {
        throw new RuntimeException('Konnte Curl-Handle nicht initialisieren: ' . $url);
    }

    // Crawled URLs come from fetched pages and are untrusted. Without this
    // restriction a link or redirect could make cURL read file:// or other
    // non-web schemes.
    $webProtocols = CURLPROTO_HTTP | CURLPROTO_HTTPS;

    $configured = curl_setopt_array($handle, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_HEADER => true,
        CURLOPT_FOLLOWLOCATION => true,
        // Bound redirect chains so a redirect loop cannot occupy the
        // transfer until the timeout expires.
        CURLOPT_MAXREDIRS => 20,
        CURLOPT_PROTOCOLS => $webProtocols,
        CURLOPT_REDIR_PROTOCOLS => $webProtocols,
        CURLOPT_TIMEOUT => self::CURL_TIMEOUT,
        CURLOPT_USERAGENT => self::USER_AGENT,
        // NOTE(review): certificate verification is disabled, presumably so
        // that sites with broken certificates can still be crawled. Confirm
        // this is intended before using the crawler outside of tests.
        CURLOPT_SSL_VERIFYPEER => false,
    ]);

    if (!$configured) {
        curl_close($handle);
        throw new RuntimeException('Konnte Curl-Handle nicht konfigurieren: ' . $url);
    }

    return $handle;
}
/**
 * Splits a raw response into headers and body and builds the result array.
 *
 * When redirects were followed, the header section holds one header block
 * per response; only the last block is parsed, so "headers_parsed" belongs
 * to the same response as "status_code".
 *
 * @param string $response Complete response including headers.
 * @param array<string,mixed> $info Result of curl_getinfo() for the transfer.
 * @return array<string,mixed> Keys: url, status_code, headers_parsed, body,
 *                             response_time (seconds) and body_size (bytes).
 */
private function buildResponsePayload(string $response, array $info): array
{
    $headerSize = max(0, (int) ($info['header_size'] ?? 0));
    $rawHeaders = (string) substr($response, 0, $headerSize);
    $body = (string) substr($response, $headerSize);

    // Header blocks of consecutive responses are separated by an empty line.
    $headerBlocks = preg_split('/\r?\n\r?\n/', trim($rawHeaders));
    $finalHeaders = is_array($headerBlocks) && $headerBlocks !== []
        ? (string) end($headerBlocks)
        : $rawHeaders;

    return [
        'url' => $info['url'] ?? ($info['redirect_url'] ?? ''),
        'status_code' => (int) ($info['http_code'] ?? 0),
        'headers_parsed' => $this->parseHeaders($finalHeaders),
        'body' => $body,
        'response_time' => (float) ($info['total_time'] ?? 0.0),
        // Size in bytes, not characters.
        'body_size' => strlen($body),
    ];
}
/**
 * Converts a raw header string into an associative array.
 *
 * Lines without a colon (such as the status line) are ignored. When a
 * header name occurs more than once, the last value is kept.
 *
 * @param string $headers Raw header text.
 * @return array<string,string> Header values keyed by trimmed header name.
 */
private function parseHeaders(string $headers): array
{
    $result = [];
    $lines = preg_split('/\r?\n/', trim($headers));

    foreach ($lines as $headerLine) {
        // An empty line has no colon either, so one check covers both.
        $colonAt = strpos($headerLine, ':');
        if ($colonAt === false) {
            continue;
        }

        // Split at the first colon only; values may contain further colons.
        $name = trim(substr($headerLine, 0, $colonAt));
        $content = trim(substr($headerLine, $colonAt + 1));
        $result[$name] = $content;
    }

    return $result;
}
/**
 * Resolves a link reference against a base URL into an absolute address.
 *
 * References that carry their own scheme (https:, mailto:, tel:,
 * javascript:, ...) are returned unchanged. Protocol-relative, absolute-path,
 * relative-path, query-only and fragment-only references are resolved
 * following the reference resolution rules of RFC 3986, including removal of
 * "." and ".." segments. User info of the base URL is not carried over.
 *
 * @param string $href Reference as found in the document.
 * @param string $baseUrl Absolute URL of the document containing the link.
 * @return string Absolute URL, or $href unchanged when it is empty, already
 *                absolute, or the base URL lacks a scheme or host.
 */
private function resolveUrl(string $href, string $baseUrl): string
{
    // A leading "scheme:" marks an absolute reference. Testing the syntax
    // instead of FILTER_VALIDATE_URL keeps tel:/javascript: links and URLs
    // with non-ASCII characters from being treated as relative paths.
    if ($href === '' || preg_match('#^[a-z][a-z0-9+.\-]*:#i', $href) === 1) {
        return $href;
    }

    if ($baseUrl === '') {
        return $href;
    }

    $baseParts = parse_url($baseUrl);
    if ($baseParts === false || !isset($baseParts['scheme'], $baseParts['host'])) {
        return $href;
    }

    $scheme = $baseParts['scheme'];

    // Protocol-relative reference: only the scheme comes from the base.
    if (strncmp($href, '//', 2) === 0) {
        return $scheme . ':' . $href;
    }

    $port = isset($baseParts['port']) ? ':' . $baseParts['port'] : '';
    $origin = $scheme . '://' . $baseParts['host'] . $port;

    $basePath = $baseParts['path'] ?? '/';
    if ($basePath === '' || $basePath[0] !== '/') {
        $basePath = '/' . $basePath;
    }

    // Fragment-only reference: same document, including its query.
    if ($href[0] === '#') {
        $baseQuery = isset($baseParts['query']) ? '?' . $baseParts['query'] : '';

        return $origin . $basePath . $baseQuery . $href;
    }

    // Query-only reference: same path with a different query.
    if ($href[0] === '?') {
        return $origin . $basePath . $href;
    }

    // Separate the path from query/fragment so dot-segment removal cannot
    // touch "." or "/" characters inside the query string.
    $pathLength = strcspn($href, '?#');
    $path = (string) substr($href, 0, $pathLength);
    $suffix = (string) substr($href, $pathLength);

    if ($path[0] !== '/') {
        // Relative path: replace everything after the last "/" of the base.
        $path = substr($basePath, 0, (int) strrpos($basePath, '/') + 1) . $path;
    }

    // Remove "." and ".." segments; ".." never climbs above the root.
    $segments = explode('/', $path);
    $lastSegment = end($segments);
    $resolved = [];
    foreach ($segments as $segment) {
        if ($segment === '.') {
            continue;
        }
        if ($segment === '..') {
            if (count($resolved) > 1) {
                array_pop($resolved);
            }
            continue;
        }
        $resolved[] = $segment;
    }
    if ($lastSegment === '.' || $lastSegment === '..') {
        // A trailing dot segment refers to a directory; keep the final "/".
        $resolved[] = '';
    }

    return $origin . '/' . ltrim(implode('/', $resolved), '/') . $suffix;
}
/**
 * Returns the text as UTF-8 with whitespace runs collapsed to single spaces
 * and surrounding whitespace removed.
 *
 * The input is transcoded first. The Unicode-aware whitespace pattern fails
 * on byte sequences that are not valid UTF-8, which previously turned such
 * text into an empty string.
 *
 * @param string $text Text in UTF-8, ISO-8859-1 or Windows-1252.
 * @return string Normalised UTF-8 text.
 */
private function normaliseText(string $text): string
{
    $encoding = mb_detect_encoding($text, ['UTF-8', 'ISO-8859-1', 'Windows-1252'], true);
    if ($encoding === false) {
        $encoding = 'UTF-8';
    }

    $utf8 = (string) mb_convert_encoding($text, 'UTF-8', $encoding);

    // preg_replace() returns null on failure; fall back to the transcoded
    // text instead of discarding it.
    $collapsed = preg_replace('/\s+/u', ' ', $utf8);

    return trim($collapsed ?? $utf8);
}
/**
 * Looks up the id of a URL row within one crawl session.
 *
 * @param int $crawlID Identifier of the crawl session.
 * @param string $url URL to look up.
 * @return int|null Row id, or null when the statement cannot be prepared,
 *                  no result set is available, or no row matches.
 */
private function resolveUrlId(int $crawlID, string $url): ?int
{
    $lookup = $this->db->prepare('SELECT id FROM urls WHERE url = ? AND crawl_id = ? LIMIT 1');
    if ($lookup === false) {
        return null;
    }

    $lookup->bind_param('si', $url, $crawlID);
    $lookup->execute();
    $rows = $lookup->get_result();

    $foundId = null;
    if ($rows) {
        // fetch_assoc() yields no array when the result set is empty.
        $row = $rows->fetch_assoc();
        $foundId = $row['id'] ?? null;
    }
    $lookup->close();

    if ($foundId === null) {
        return null;
    }

    return (int) $foundId;
}
}