Compare commits
6 Commits
f568875b2c
...
4e868ca8e9
| Author | SHA1 | Date | |
|---|---|---|---|
| 4e868ca8e9 | |||
| a6e2a7733e | |||
| 67390a76f3 | |||
| d9d73eee41 | |||
| 5f6f179518 | |||
| b3e8f2ce85 |
2
.gitignore
vendored
2
.gitignore
vendored
@@ -22,4 +22,4 @@ Thumbs.db
|
||||
*.cache
|
||||
|
||||
# Docker
|
||||
docker-compose.override.yml
|
||||
docker-compose.override.yml
|
||||
19
AGENTS.md
Normal file
19
AGENTS.md
Normal file
@@ -0,0 +1,19 @@
|
||||
# Repository Guidelines
|
||||
|
||||
## Project Structure & Module Organization
|
||||
The codebase is intentionally lean. `index.php` bootstraps the crawl by instantiating `webanalyse` and handing off the crawl identifier. Core crawling logic lives in `webanalyse.php`, which houses HTTP fetching, link extraction, and database persistence. Use `setnew.php` to reset seed data inside the `screaming_frog` schema before a rerun. Keep new helpers in their own PHP files under this root so the autoload includes stay predictable; group SQL migrations or fixtures under a `database/` folder if you add them. IDE settings reside in `.idea/`.
|
||||
|
||||
## Build, Test, and Development Commands
|
||||
Run the project through Apache in XAMPP or start the PHP built-in server with `php -S localhost:8080 index.php` from this directory. Validate syntax quickly via `php -l webanalyse.php` (repeat for any new file). When iterating on crawl logic, truncate runtime tables with `php setnew.php` to restore the baseline dataset.
|
||||
|
||||
## Coding Style & Naming Conventions
|
||||
Follow PSR-12 style cues already in use: 4-space indentation, brace-on-new-line for functions, and `declare(strict_types=1);` at the top of entry scripts. Favour descriptive camelCase for methods (`getMultipleWebsites`) and snake_case only for direct SQL field names. Maintain `mysqli` usage for consistency, and gate new configuration through constants or clearly named environment variables.
|
||||
|
||||
## Testing Guidelines
|
||||
There is no automated suite yet; treat each crawl as an integration test. After code changes, run `php setnew.php` followed by a crawl and confirm that `crawl`, `urls`, and `links` tables reflect the expected row counts. Log anomalies with `error_log()` while developing, and remove or downgrade to structured responses before merging.
|
||||
|
||||
## Commit & Pull Request Guidelines
|
||||
Author commit messages in the present tense with a concise summary (`Add link grouping for external URLs`). Group related SQL adjustments with their PHP changes in the same commit. For pull requests, include: a short context paragraph, reproduction steps, screenshots of key output tables when behaviour changes, and any follow-up tasks. Link tracking tickets or issues so downstream agents can trace decisions.
|
||||
|
||||
## Security & Configuration Notes
|
||||
Database credentials are currently hard-coded for local XAMPP usage. If you introduce environment-based configuration, document expected `.env` keys and ensure credentials are excluded from version control. Never commit production connection details or raw crawl exports.
|
||||
@@ -38,8 +38,5 @@ RUN chown -R www-data:www-data /var/www/html \
|
||||
# Expose port 80
|
||||
EXPOSE 80
|
||||
|
||||
# Start script
|
||||
COPY start.sh /start.sh
|
||||
RUN chmod +x /start.sh
|
||||
|
||||
CMD ["/start.sh"]
|
||||
# Start PHP-FPM and Nginx
|
||||
CMD php-fpm -D && nginx -g 'daemon off;'
|
||||
|
||||
@@ -14,7 +14,7 @@ server {
|
||||
location ~ \.php$ {
|
||||
try_files $uri =404;
|
||||
fastcgi_split_path_info ^(.+\.php)(/.+)$;
|
||||
fastcgi_pass 127.0.0.1:9000;
|
||||
fastcgi_pass localhost:9000;
|
||||
fastcgi_index index.php;
|
||||
include fastcgi_params;
|
||||
fastcgi_param SCRIPT_FILENAME $document_root$fastcgi_script_name;
|
||||
|
||||
11
index.php
Normal file
11
index.php
Normal file
@@ -0,0 +1,11 @@
|
||||
<?php

declare(strict_types=1);

// Development error settings; crawl results are inspected via logs and DB tables.
error_reporting(E_ALL);
ini_set('display_errors', '1');

require_once 'webanalyse.php';

// BUG FIX: a second, never-used mysqli connection ($db) was opened here.
// WebAnalyse manages its own connection, so the duplicate has been removed.
$wa = new WebAnalyse();

// Run one crawl pass for crawl session 1 (seeded by setnew.php).
$wa->doCrawl(1);
|
||||
11
setnew.php
Normal file
11
setnew.php
Normal file
@@ -0,0 +1,11 @@
|
||||
<?php

declare(strict_types=1);

// Resets the seed data inside the screaming_frog schema before a crawl rerun.

$db = mysqli_connect('localhost', 'root', '', 'screaming_frog');
if (!$db instanceof mysqli) {
    // BUG FIX: the connection result was previously used without any check,
    // so a failed connection caused a fatal call on false.
    fwrite(STDERR, 'Datenbankverbindung fehlgeschlagen: ' . mysqli_connect_error() . PHP_EOL);
    exit(1);
}

$db->query("truncate table crawl");
// Alternative seed (crawl the site root instead of a single page):
// $db->query("insert into crawl (start_url, user_id) values ('https://kies-media.de/', 1)");
$db->query("insert into crawl (start_url, user_id) values ('https://kies-media.de/leistungen/externer-ausbilder-fuer-fachinformatiker/', 1)");

$db->query("truncate table urls");
// Seed the urls table with the crawl's start URL; the unused $urls capture
// of the result was removed (INSERT returns bool, not a result set).
$db->query("insert ignore into urls (id, url, crawl_id) select 1,start_url, id from crawl where id = 1");

$db->query("truncate table links");
|
||||
530
webanalyse.php
Normal file
530
webanalyse.php
Normal file
@@ -0,0 +1,530 @@
|
||||
<?php
|
||||
|
||||
declare(strict_types=1);
|
||||
|
||||
/**
 * Coordinates website crawls and persists response data into the
 * Screaming Frog database (tables: crawl, urls, links).
 */
class WebAnalyse
{
    // Browser-like User-Agent sent with every request.
    private const USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36';
    // Per-request curl timeout in seconds.
    private const CURL_TIMEOUT = 30;

    /**
     * @var mysqli Connection to the Screaming Frog database.
     */
    private mysqli $db;
|
||||
|
||||
/**
 * Accepts an existing database connection or opens the default local one.
 *
 * @param mysqli|null $connection Optional pre-configured connection; when null,
 *                                the hard-coded local XAMPP credentials are used.
 * @throws RuntimeException When no usable mysqli connection can be obtained.
 */
public function __construct(?mysqli $connection = null)
{
    if ($connection === null) {
        $connection = mysqli_connect('localhost', 'root', '', 'screaming_frog');
    }

    if (($connection instanceof mysqli) === false) {
        throw new RuntimeException('Verbindung zur Datenbank konnte nicht hergestellt werden: ' . mysqli_connect_error());
    }

    // utf8mb4 so crawled page bodies with any Unicode content store cleanly.
    $connection->set_charset('utf8mb4');
    $this->db = $connection;
}
|
||||
|
||||
/**
 * Fetches a single URL and returns response metadata.
 *
 * @param string $url Target address to request.
 * @return array<string,mixed> Response payload, or an array with an "error" key on failure.
 */
public function getWebsite(string $url): array
{
    $curl = $this->createCurlHandle($url);

    $raw = curl_exec($curl);
    if ($raw === false) {
        // Capture the message before closing; curl_error needs a live handle.
        $message = curl_error($curl);
        curl_close($curl);

        return ['error' => $message];
    }

    $meta = curl_getinfo($curl);
    curl_close($curl);

    return $this->buildResponsePayload($raw, $meta);
}
|
||||
|
||||
/**
 * Fetches several URLs in parallel via curl_multi.
 *
 * @param array<int,string> $urls List of target URLs.
 * @return array<string,array<string,mixed>> Response payload per URL; failed
 *                                           requests carry an "error" key instead.
 */
public function getMultipleWebsites(array $urls): array
{
    if ($urls === []) {
        return [];
    }

    $results = [];
    $multiHandle = curl_multi_init();
    $handles = [];

    foreach ($urls as $url) {
        $handle = $this->createCurlHandle($url);
        $handles[$url] = $handle;
        curl_multi_add_handle($multiHandle, $handle);
    }

    // Kick off the transfers.
    $running = null;
    do {
        $status = curl_multi_exec($multiHandle, $running);
    } while ($status === CURLM_CALL_MULTI_PERFORM);

    // Drive them to completion, waiting for socket activity between passes.
    while ($running && $status === CURLM_OK) {
        // On select failure back off briefly instead of busy-looping.
        if (curl_multi_select($multiHandle, 1.0) === -1) {
            usleep(100000);
        }

        do {
            $status = curl_multi_exec($multiHandle, $running);
        } while ($status === CURLM_CALL_MULTI_PERFORM);
    }

    foreach ($handles as $url => $handle) {
        // BUG FIX: curl_multi_getcontent() returns string|null — never false —
        // so the previous `=== false` check was dead and failed transfers were
        // reported as empty successes. Detect per-handle errors via curl_errno().
        if (curl_errno($handle) !== 0) {
            $results[$url] = ['error' => curl_error($handle)];
        } else {
            $response = (string) curl_multi_getcontent($handle);
            $results[$url] = $this->buildResponsePayload($response, curl_getinfo($handle));
        }

        curl_multi_remove_handle($multiHandle, $handle);
        curl_close($handle);
    }

    curl_multi_close($multiHandle);

    return $results;
}
|
||||
|
||||
/**
 * Persists response data for one crawled URL and triggers link analysis.
 *
 * @param int                 $crawlID Identifier of the crawl session.
 * @param string              $url     Origin URL whose response is processed.
 * @param array<string,mixed> $data    Result of the HTTP request.
 * @throws RuntimeException When the UPDATE statement cannot be prepared.
 */
public function processResults(int $crawlID, string $url, array $data): void
{
    if (isset($data['error'])) {
        error_log(sprintf('Fehler bei der Analyse von %s: %s', $url, $data['error']));
        return;
    }

    $body = (string)($data['body'] ?? '');

    $statement = $this->db->prepare(
        'UPDATE urls
         SET status_code = ?, response_time = ?, body_size = ?, date = NOW(), body = ?
         WHERE url = ? AND crawl_id = ?
         LIMIT 1'
    );
    if ($statement === false) {
        throw new RuntimeException('Update-Statement konnte nicht vorbereitet werden: ' . $this->db->error);
    }

    $statusCode = (int)($data['status_code'] ?? 0);
    // curl reports seconds as float; the column stores milliseconds.
    $responseTimeMs = (int)round(((float)($data['response_time'] ?? 0)) * 1000);
    $bodySize = (int)($data['body_size'] ?? strlen($body));

    $statement->bind_param('iiissi', $statusCode, $responseTimeMs, $bodySize, $body, $url, $crawlID);
    $statement->execute();
    $statement->close();

    // Extract and register any links discovered in the fetched body.
    $this->findNewUrls($crawlID, $body, $url);
}
|
||||
|
||||
/**
 * Extracts links from a response body and records new URL rows plus the
 * link edges (origin -> target) for this crawl.
 *
 * The old outgoing links of the origin URL are deleted first, then each
 * discovered target is upserted into `urls` and a `links` row is written.
 *
 * @param int    $crawlID Identifier of the crawl session.
 * @param string $body    HTML body of the response.
 * @param string $url     Processed URL; serves as context for relative links.
 * @throws RuntimeException When the insert/select statements cannot be prepared.
 */
public function findNewUrls(int $crawlID, string $body, string $url): void
{
    if ($body === '') {
        return;
    }

    $links = $this->extractLinks($body, $url);
    if ($links === []) {
        return;
    }

    // Without a known id for the origin URL no link rows can be attributed.
    $originId = $this->resolveUrlId($crawlID, $url);
    if ($originId === null) {
        return;
    }

    // Re-crawl semantics: drop the origin's previous outgoing links so the
    // link set reflects the latest body. Best-effort — a failed prepare is
    // silently skipped here (NOTE(review): stale links would then remain).
    $deleteLinksStmt = $this->db->prepare('DELETE FROM links WHERE von = ?');
    if ($deleteLinksStmt !== false) {
        $deleteLinksStmt->bind_param('i', $originId);
        $deleteLinksStmt->execute();
        $deleteLinksStmt->close();
    }

    // Three statements prepared once and re-bound per link in the loop below.
    $insertUrlStmt = $this->db->prepare('INSERT IGNORE INTO urls (url, crawl_id) VALUES (?, ?)');
    $selectUrlStmt = $this->db->prepare('SELECT id FROM urls WHERE url = ? AND crawl_id = ? LIMIT 1');
    $insertLinkStmt = $this->db->prepare('INSERT IGNORE INTO links (von, nach, linktext, dofollow) VALUES (?, ?, ?, ?)');

    if (!$insertUrlStmt || !$selectUrlStmt || !$insertLinkStmt) {
        throw new RuntimeException('Vorbereitete Statements konnten nicht erstellt werden: ' . $this->db->error);
    }

    foreach ($links as $link) {
        $absoluteUrl = (string)$link['absolute_url'];

        $insertUrlStmt->bind_param('si', $absoluteUrl, $crawlID);
        $insertUrlStmt->execute();

        // INSERT IGNORE leaves insert_id at 0 when the row already existed,
        // so fall back to a SELECT to resolve the existing row's id.
        $targetId = $this->db->insert_id;
        if ($targetId === 0) {
            $selectUrlStmt->bind_param('si', $absoluteUrl, $crawlID);
            $selectUrlStmt->execute();
            $result = $selectUrlStmt->get_result();
            $targetId = $result ? (int)($result->fetch_assoc()['id'] ?? 0) : 0;
        }

        // Still unresolved (insert and lookup both failed) — skip this link.
        if ($targetId === 0) {
            continue;
        }

        $linkText = $this->normaliseText((string)($link['text'] ?? ''));
        // dofollow = 1 unless the rel attribute contains "nofollow".
        $isFollow = (int)(strpos((string)($link['rel'] ?? ''), 'nofollow') !== false ? 0 : 1);

        $insertLinkStmt->bind_param('iisi', $originId, $targetId, $linkText, $isFollow);
        $insertLinkStmt->execute();
    }

    $insertUrlStmt->close();
    $selectUrlStmt->close();
    $insertLinkStmt->close();
}
|
||||
|
||||
/**
 * Runs one crawl pass over URLs of a session that have not been fetched yet.
 *
 * Processes at most 50 pending URLs (those with a NULL date) per call.
 *
 * @param int $crawlID Identifier of the crawl session.
 */
public function doCrawl(int $crawlID): void
{
    $statement = $this->db->prepare(
        'SELECT url FROM urls WHERE crawl_id = ? AND date IS NULL LIMIT 50'
    );
    if ($statement === false) {
        return;
    }

    $statement->bind_param('i', $crawlID);
    $statement->execute();
    $resultSet = $statement->get_result();

    if (!$resultSet instanceof mysqli_result) {
        $statement->close();
        return;
    }

    $pending = [];
    while (($row = $resultSet->fetch_assoc()) !== null) {
        $pending[] = $row['url'];
    }

    $resultSet->free();
    $statement->close();

    if ($pending === []) {
        return;
    }

    // Fetch all pending URLs in parallel, then persist each response.
    foreach ($this->getMultipleWebsites($pending) as $fetchedUrl => $payload) {
        $this->processResults($crawlID, $fetchedUrl, $payload);
    }
}
|
||||
|
||||
/**
 * Parses an HTML document and returns a structured list of anchor links.
 *
 * @param string $html    Raw HTML document.
 * @param string $baseUrl Base URL used to resolve relative paths.
 * @return array<int,array<string,mixed>> Collected link data.
 */
public function extractLinks(string $html, string $baseUrl = ''): array
{
    $document = new DOMDocument();

    // Real-world markup is rarely valid: collect libxml errors silently and
    // restore the previous error mode afterwards. The XML prolog forces UTF-8.
    $previousErrorMode = libxml_use_internal_errors(true);
    $document->loadHTML('<?xml encoding="UTF-8">' . $html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    $links = [];

    foreach ($document->getElementsByTagName('a') as $position => $anchor) {
        $href = trim($anchor->getAttribute('href'));
        if ($href === '') {
            continue;
        }

        $absoluteUrl = $this->resolveUrl($href, $baseUrl);
        $rel = $anchor->getAttribute('rel');
        $title = $anchor->getAttribute('title');
        $target = $anchor->getAttribute('target');

        $links[] = [
            'index' => $position + 1,
            'href' => $href,
            'absolute_url' => $absoluteUrl,
            'text' => $this->normaliseText(trim($anchor->textContent)),
            'rel' => $rel === '' ? null : $rel,
            'title' => $title === '' ? null : $title,
            'target' => $target === '' ? null : $target,
            'is_external' => $this->isExternalLink($absoluteUrl, $baseUrl),
            'link_type' => $this->getLinkType($href),
            'is_internal' => $this->isInternalLink($absoluteUrl, $baseUrl) ? 1 : 0,
        ];
    }

    return $links;
}
|
||||
|
||||
/**
 * Checks whether a link is external from the base URL's point of view.
 *
 * Host names are compared case-insensitively (the URL host component is
 * case-insensitive per RFC 3986).
 *
 * @param string $href    Link target.
 * @param string $baseUrl Origin address used for the domain comparison.
 * @return bool|null True for external, false for internal, null if undetermined.
 */
private function isExternalLink(string $href, string $baseUrl): ?bool
{
    if ($baseUrl === '') {
        return null;
    }

    // parse_url(..., PHP_URL_HOST) returns null when the component is absent
    // and FALSE for seriously malformed URLs. BUG FIX: the old `=== null`
    // checks let false through into hash_equals(), a TypeError on PHP 8.
    $baseDomain = parse_url($baseUrl, PHP_URL_HOST);
    $linkDomain = parse_url($href, PHP_URL_HOST);

    if (!is_string($baseDomain) || !is_string($linkDomain)) {
        return null;
    }

    // hash_equals() was the wrong tool here — it exists for timing-safe
    // secret comparison. Domains compare case-insensitively.
    return strcasecmp($baseDomain, $linkDomain) !== 0;
}
|
||||
|
||||
/**
 * Checks whether a link belongs to the same domain as the base URL.
 *
 * Host names are compared case-insensitively (the URL host component is
 * case-insensitive per RFC 3986).
 *
 * @param string $href    Link target.
 * @param string $baseUrl Origin address used for the domain comparison.
 * @return bool|null True for internal, false for external, null if undetermined.
 */
private function isInternalLink(string $href, string $baseUrl): ?bool
{
    if ($baseUrl === '') {
        return null;
    }

    // parse_url(..., PHP_URL_HOST) returns null when the component is absent
    // and FALSE for seriously malformed URLs. BUG FIX: the old `=== null`
    // checks let false through into hash_equals(), a TypeError on PHP 8.
    $baseDomain = parse_url($baseUrl, PHP_URL_HOST);
    $linkDomain = parse_url($href, PHP_URL_HOST);

    if (!is_string($baseDomain) || !is_string($linkDomain)) {
        return null;
    }

    // hash_equals() was the wrong tool here — it exists for timing-safe
    // secret comparison. Domains compare case-insensitively.
    return strcasecmp($baseDomain, $linkDomain) === 0;
}
|
||||
|
||||
/**
 * Derives the link type from common protocols and patterns.
 *
 * @param string $href Link target.
 * @return string Descriptive type such as "absolute" or "email".
 */
private function getLinkType(string $href): string
{
    if ($href === '') {
        return 'empty';
    }

    $needle = strtolower($href);

    // First-true-condition chain expressed as a match expression.
    return match (true) {
        str_starts_with($needle, 'mailto:') => 'email',
        str_starts_with($needle, 'tel:') => 'phone',
        str_starts_with($needle, '#') => 'anchor',
        str_starts_with($needle, 'javascript:') => 'javascript',
        (bool) filter_var($href, FILTER_VALIDATE_URL) => 'absolute',
        default => 'relative',
    };
}
|
||||
|
||||
/**
 * Groups extracted links by their previously determined type.
 *
 * @param array<int,array<string,mixed>> $links List of extracted links.
 * @return array<string,array<int,array<string,mixed>>> Links keyed by type.
 */
public function groupLinksByType(array $links): array
{
    return array_reduce(
        $links,
        static function (array $grouped, array $link): array {
            // Missing link_type falls into the "unknown" bucket.
            $grouped[(string)($link['link_type'] ?? 'unknown')][] = $link;

            return $grouped;
        },
        []
    );
}
|
||||
|
||||
/**
 * Creates a fully configured curl handle for a single request.
 *
 * @param string $url Target address.
 * @return CurlHandle Ready-to-execute handle.
 * @throws RuntimeException When the handle cannot be initialised.
 */
private function createCurlHandle(string $url): CurlHandle
{
    // curl_init($url) already sets the URL; the redundant CURLOPT_URL
    // option from the original was dropped.
    $handle = curl_init($url);
    if ($handle === false) {
        throw new RuntimeException('Konnte Curl-Handle nicht initialisieren: ' . $url);
    }

    curl_setopt_array($handle, [
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_HEADER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_TIMEOUT => self::CURL_TIMEOUT,
        CURLOPT_USERAGENT => self::USER_AGENT,
        // SECURITY: disabling peer verification permits man-in-the-middle
        // attacks. Kept for behavioural parity (local dev / self-signed
        // hosts) — enable verification before any production use.
        CURLOPT_SSL_VERIFYPEER => false,
    ]);

    return $handle;
}
|
||||
|
||||
/**
 * Splits a raw response into headers and body and assembles the payload array.
 *
 * @param string              $response Full response including headers.
 * @param array<string,mixed> $info     Result of curl_getinfo().
 * @return array<string,mixed>
 */
private function buildResponsePayload(string $response, array $info): array
{
    // header_size marks the byte offset where the body begins.
    $headerSize = (int)($info['header_size'] ?? 0);
    $rawHeaders = substr($response, 0, $headerSize);
    $body = substr($response, $headerSize);

    $payload = [];
    $payload['url'] = $info['url'] ?? ($info['redirect_url'] ?? '');
    $payload['status_code'] = (int)($info['http_code'] ?? 0);
    $payload['headers_parsed'] = $this->parseHeaders($rawHeaders);
    $payload['body'] = $body;
    $payload['response_time'] = (float)($info['total_time'] ?? 0.0);
    $payload['body_size'] = strlen($body);

    return $payload;
}
|
||||
|
||||
/**
 * Converts a raw header string into an associative array.
 *
 * Note: repeated header names (e.g. Set-Cookie) keep only the last value,
 * and the status line (no colon) is skipped.
 *
 * @param string $headers Raw header block.
 * @return array<string,string>
 */
private function parseHeaders(string $headers): array
{
    $parsed = [];

    foreach (preg_split('/\r?\n/', trim($headers)) as $line) {
        $separator = strpos($line, ':');
        if ($line === '' || $separator === false) {
            continue;
        }

        // Split on the first colon only; values may contain colons themselves.
        $name = trim(substr($line, 0, $separator));
        $value = trim(substr($line, $separator + 1));
        $parsed[$name] = $value;
    }

    return $parsed;
}
|
||||
|
||||
/**
 * Resolves a (possibly relative) href against a base URL into an absolute address.
 *
 * Already-absolute URLs, and hrefs without a usable base, are returned unchanged.
 *
 * @param string $href    Raw link target.
 * @param string $baseUrl Base URL providing scheme/host/path context.
 * @return string Absolute URL, or the original href when resolution is impossible.
 */
private function resolveUrl(string $href, string $baseUrl): string
{
    if ($href === '' || filter_var($href, FILTER_VALIDATE_URL)) {
        return $href;
    }

    if ($baseUrl === '') {
        return $href;
    }

    $baseParts = parse_url($baseUrl);
    if ($baseParts === false || !isset($baseParts['scheme'], $baseParts['host'])) {
        return $href;
    }

    $scheme = $baseParts['scheme'];

    // BUG FIX: protocol-relative links ("//cdn.example.com/x") were previously
    // glued onto the base host, producing "scheme://host//cdn.example.com/x".
    // Per RFC 3986 they only inherit the base scheme.
    if (strpos($href, '//') === 0) {
        return $scheme . ':' . $href;
    }

    $host = $baseParts['host'];
    $port = isset($baseParts['port']) ? ':' . $baseParts['port'] : '';
    $basePath = $baseParts['path'] ?? '/';

    if (strpos($href, '/') === 0) {
        // Root-relative: replaces the whole base path.
        $path = $href;
    } else {
        // Document-relative: strip the trailing file segment so the link
        // resolves against the containing directory.
        if (substr($basePath, -1) !== '/') {
            $basePath = preg_replace('#/[^/]*$#', '/', $basePath) ?: '/';
        }
        $path = $basePath . $href;
    }

    return sprintf('%s://%s%s%s', $scheme, $host, $port, '/' . ltrim($path, '/'));
}
|
||||
|
||||
/**
 * Produces clean, single-spaced UTF-8 text without control whitespace.
 */
private function normaliseText(string $text): string
{
    // Collapse any whitespace run to a single space; preg_replace yields
    // null on PCRE error, which we treat as empty text.
    $collapsed = preg_replace('/\s+/u', ' ', $text);
    if ($collapsed === null) {
        $collapsed = '';
    }

    // Strict detection over the encodings seen in crawled pages; default to
    // UTF-8 when detection fails.
    $detected = mb_detect_encoding($collapsed, ['UTF-8', 'ISO-8859-1', 'Windows-1252'], true);
    if ($detected === false) {
        $detected = 'UTF-8';
    }

    return trim(mb_convert_encoding($collapsed, 'UTF-8', $detected));
}
|
||||
|
||||
/**
 * Looks up the primary key of a URL within a crawl session.
 *
 * @return int|null The URL id, or null when the row is missing or the query fails.
 */
private function resolveUrlId(int $crawlID, string $url): ?int
{
    $statement = $this->db->prepare('SELECT id FROM urls WHERE url = ? AND crawl_id = ? LIMIT 1');
    if ($statement === false) {
        return null;
    }

    $statement->bind_param('si', $url, $crawlID);
    $statement->execute();

    $id = null;
    $result = $statement->get_result();
    if ($result) {
        // fetch_assoc() returns null when no row matched.
        $row = $result->fetch_assoc();
        $id = $row['id'] ?? null;
    }
    $statement->close();

    return $id === null ? null : (int)$id;
}
|
||||
}
|
||||
Reference in New Issue
Block a user