웹사이트 크롤링 후 파일 압축 다운로드 > 자료실

웹사이트 크롤링 후 파일 압축 다운로드

페이지 정보

본문

<!--
  Crawl request form. The empty action posts back to this same page.
  The PHP handler further down reads the "url" field and only runs when the
  "submit" button name is present in the POST data, so both names must stay
  in sync with that handler.
-->
<h1>웹 페이지 크롤링 및 파일 압축 다운로드</h1>
<form action="" method="post">
	<label for="url">크롤링할 웹 페이지 URL:</label>

	<input type="text" name="url" id="url" size="50" required>

	<button type="submit" name="submit">크롤링 및 파일 압축 다운로드</button>
</form>


<?php
/**
 * Consults the target site's robots.txt to decide whether a URL may be crawled.
 *
 * Only "Disallow" rules are interpreted. A rule applies when it belongs to a
 * "User-agent: *" group (or appears before any User-agent line) and the URL's
 * path (plus query string) starts with the rule's path. "Allow" rules and
 * wildcard patterns ("*", "$") are not interpreted.
 *
 * @param string $url Absolute URL that is about to be fetched.
 * @return bool False when a matching Disallow rule is found; true otherwise,
 *              including when the URL has no scheme/host or robots.txt
 *              cannot be read.
 */
function isCrawlingAllowed($url) {
	$parsedUrl = parse_url($url);
	if (!is_array($parsedUrl) || empty($parsedUrl['scheme']) || empty($parsedUrl['host'])) {
		// Without a scheme and host there is no robots.txt location to consult.
		return true;
	}

	// Keep an explicit port: robots.txt is served per origin, not per host name.
	$origin = $parsedUrl['scheme'] . '://' . $parsedUrl['host'];
	if (isset($parsedUrl['port'])) {
		$origin .= ':' . $parsedUrl['port'];
	}

	// A missing or unreachable robots.txt is an expected case, so the warning
	// is suppressed and the false return value is handled explicitly.
	$robotsContent = @file_get_contents($origin . '/robots.txt');
	if ($robotsContent === false) {
		return true; // no robots.txt available: crawling is allowed
	}

	// Rules are compared against the path (and query), never the full URL;
	// otherwise "Disallow: /" would match the "//" of every URL.
	$target = (isset($parsedUrl['path']) && $parsedUrl['path'] !== '') ? $parsedUrl['path'] : '/';
	if (isset($parsedUrl['query'])) {
		$target .= '?' . $parsedUrl['query'];
	}

	$groupApplies = true;   // rules that precede any User-agent line are honoured
	$readingAgents = false; // true while consecutive User-agent lines are being read

	foreach (preg_split('/\r\n|\r|\n/', $robotsContent) as $line) {
		// Drop trailing comments, then surrounding whitespace.
		$commentPos = strpos($line, '#');
		if ($commentPos !== false) {
			$line = substr($line, 0, $commentPos);
		}
		$line = trim($line);

		$colonPos = strpos($line, ':');
		if ($colonPos === false) {
			continue; // blank or malformed line
		}

		$field = strtolower(trim(substr($line, 0, $colonPos)));
		$value = trim(substr($line, $colonPos + 1));

		if ($field === 'user-agent') {
			if (!$readingAgents) {
				// First User-agent line of a new group: reset applicability.
				$groupApplies = false;
				$readingAgents = true;
			}
			if ($value === '*') {
				$groupApplies = true;
			}
			continue;
		}

		// Any other directive ends the run of User-agent lines.
		$readingAgents = false;

		// An empty Disallow value means "nothing is disallowed".
		if ($field === 'disallow' && $groupApplies && $value !== '' && strpos($target, $value) === 0) {
			return false;
		}
	}

	return true;
}

/**
 * Resolves a reference found in the crawled HTML against the page URL.
 *
 * Only http/https results are produced, so references such as "file:",
 * "data:" or "javascript:" (and bare local paths) can never be passed to
 * file_get_contents() as something other than a remote web resource.
 *
 * @param string $baseUrl   Absolute http(s) URL of the crawled page.
 * @param string $reference Raw attribute value (absolute, protocol-relative or relative).
 * @return string|false Absolute URL without fragment, or false when it cannot be resolved safely.
 */
function crawlerResolveUrl($baseUrl, $reference) {
	$reference = trim($reference);

	// The fragment is never sent to the server.
	$hashPos = strpos($reference, '#');
	if ($hashPos !== false) {
		$reference = substr($reference, 0, $hashPos);
	}
	if ($reference === '') {
		return false;
	}

	// Reference that already carries a scheme.
	if (preg_match('/^([a-z][a-z0-9+.\-]*):/i', $reference, $schemeMatch)) {
		$scheme = strtolower($schemeMatch[1]);
		return ($scheme === 'http' || $scheme === 'https') ? $reference : false;
	}

	$base = parse_url($baseUrl);
	if (!is_array($base) || empty($base['scheme']) || empty($base['host'])) {
		return false;
	}
	$baseScheme = strtolower($base['scheme']);
	if ($baseScheme !== 'http' && $baseScheme !== 'https') {
		return false;
	}

	// Protocol-relative reference: inherit the page's scheme.
	if (strpos($reference, '//') === 0) {
		return $base['scheme'] . ':' . $reference;
	}

	$authority = $base['scheme'] . '://' . $base['host'] . (isset($base['port']) ? ':' . $base['port'] : '');
	$basePath = (isset($base['path']) && $base['path'] !== '') ? $base['path'] : '/';

	// Query-only reference keeps the page's own path.
	if ($reference[0] === '?') {
		return $authority . $basePath . $reference;
	}

	// Split off the query so dot-segment handling only touches the path.
	$query = '';
	$queryPos = strpos($reference, '?');
	if ($queryPos !== false) {
		$query = substr($reference, $queryPos);
		$reference = substr($reference, 0, $queryPos);
	}

	if ($reference[0] === '/') {
		$path = $reference;
	} else {
		// Relative to the directory of the page.
		$path = substr($basePath, 0, (int) strrpos($basePath, '/') + 1) . $reference;
	}

	// Collapse "." and ".." segments without ever climbing above the root.
	$segments = explode('/', $path);
	$lastSegment = end($segments);
	$normalized = array();
	foreach ($segments as $segment) {
		if ($segment === '.') {
			continue;
		}
		if ($segment === '..') {
			if (count($normalized) > 1) {
				array_pop($normalized);
			}
			continue;
		}
		$normalized[] = $segment;
	}
	if ($lastSegment === '.' || $lastSegment === '..') {
		$normalized[] = ''; // keep the trailing slash of a directory reference
	}

	return $authority . implode('/', $normalized) . $query;
}

/**
 * Downloads a remote http(s) resource.
 *
 * NOTE(review): hosts on private/internal networks are not filtered here;
 * add an address check if this script is reachable by untrusted users.
 *
 * @param string $url Absolute URL.
 * @return string|false Response body, or false for a non-http(s) URL or a failed request.
 */
function crawlerFetch($url) {
	$scheme = parse_url($url, PHP_URL_SCHEME);
	if (!is_string($scheme) || !in_array(strtolower($scheme), array('http', 'https'), true)) {
		return false;
	}

	// The "http" context options are also used by the https wrapper.
	$context = stream_context_create(array(
		'http' => array('timeout' => 10, 'max_redirects' => 5),
	));

	// Network failures are expected; the false return value is handled by callers.
	return @file_get_contents($url, false, $context);
}

/**
 * Collects the values of one attribute from every occurrence of a tag.
 *
 * The attribute may appear anywhere inside the tag and may use single or
 * double quotes. For "link" tags only rel="stylesheet" entries are returned.
 *
 * @param string $html      Page markup.
 * @param string $tagName   Tag to scan, e.g. "a", "link", "script", "img".
 * @param string $attribute Attribute to read, e.g. "href" or "src".
 * @return array List of non-empty, entity-decoded attribute values in document order.
 */
function crawlerExtractUrls($html, $tagName, $attribute) {
	$urls = array();
	if (!preg_match_all('/<' . preg_quote($tagName, '/') . '\b[^>]*>/i', $html, $tags)) {
		return $urls;
	}

	$attributePattern = '/\s' . preg_quote($attribute, '/') . '\s*=\s*(?:"([^"]*)"|\'([^\']*)\')/i';
	foreach ($tags[0] as $tag) {
		if (strtolower($tagName) === 'link' && !preg_match('/\srel\s*=\s*["\']?[^"\'>]*\bstylesheet\b/i', $tag)) {
			continue;
		}
		if (!preg_match($attributePattern, $tag, $match)) {
			continue;
		}
		$value = ($match[1] !== '') ? $match[1] : (isset($match[2]) ? $match[2] : '');
		// Attribute values are HTML-encoded ("&amp;" inside query strings).
		$value = trim(html_entity_decode($value, ENT_QUOTES, 'UTF-8'));
		if ($value !== '') {
			$urls[] = $value;
		}
	}

	return $urls;
}

/**
 * Builds a safe, unique archive entry name for a downloaded resource.
 *
 * @param string $url       Absolute URL of the resource.
 * @param array  $usedNames Map of entry names already present in the archive (name => true).
 * @return string Entry name derived from the URL path's basename, prefixed with a counter on collision.
 */
function crawlerArchiveName($url, array $usedNames) {
	$path = parse_url($url, PHP_URL_PATH);
	$name = is_string($path) ? basename($path) : '';

	// Keep a conservative character set and never produce a dot-file.
	$name = ltrim((string) preg_replace('/[^A-Za-z0-9._\x80-\xff-]/', '_', $name), '.');
	if ($name === '') {
		$name = 'file';
	}

	$candidate = $name;
	$counter = 1;
	while (isset($usedNames[$candidate])) {
		$candidate = $counter . '_' . $name;
		$counter++;
	}

	return $candidate;
}

// Handle the crawl form submission: fetch the page and its CSS/JS/image
// assets, pack everything into a zip archive and stream it to the client.
if (isset($_POST["submit"]) && $_SERVER["REQUEST_METHOD"] == "POST") {
	// Accept either a full URL or a bare domain from the client.
	$input = (isset($_POST["url"]) && is_string($_POST["url"])) ? trim($_POST["url"]) : '';
	if (filter_var($input, FILTER_VALIDATE_URL)) {
		$url = $input;
	} elseif (function_exists('getDomain')) {
		// getDomain() is not defined in this file; use it when the host application provides it.
		$url = getDomain($input);
	} else {
		$url = 'http://' . $input;
	}

	// FILTER_VALIDATE_URL also accepts file://, php:// etc., so the scheme is
	// restricted explicitly to prevent reading local files.
	$scheme = is_string($url) ? parse_url($url, PHP_URL_SCHEME) : null;
	$isHttpUrl = is_string($scheme) && in_array(strtolower($scheme), array('http', 'https'), true);
	if (!$isHttpUrl || filter_var($url, FILTER_VALIDATE_URL) === false) {
		die("유효하지 않은 URL 또는 도메인입니다.");
	}

	// Respect the site's robots.txt before fetching anything.
	if (!isCrawlingAllowed($url)) {
		die("robots.txt에 의해 크롤링이 허용되지 않은 URL입니다.");
	}

	// Fetch the page markup.
	$html = crawlerFetch($url);
	if ($html === false) {
		die("웹 페이지를 가져올 수 없습니다.");
	}

	// All anchor targets, listed as written in the page.
	$linkFiles = crawlerExtractUrls($html, 'a', 'href');

	// The archive is built in a private temp file so concurrent requests do
	// not share (or append to) a fixed file next to the script.
	$zipFilename = "crawled_files.zip";
	$zipPath = tempnam(sys_get_temp_dir(), 'crawl');
	$zip = new ZipArchive();

	if ($zipPath === false || $zip->open($zipPath, ZipArchive::CREATE | ZipArchive::OVERWRITE) !== true) {
		if ($zipPath !== false && is_file($zipPath)) {
			unlink($zipPath);
		}
		echo "압축 파일을 생성할 수 없습니다.";
	} else {
		$zip->addFromString('crawled_page.html', $html);
		$zip->addFromString('links.txt', implode(PHP_EOL, $linkFiles));

		$usedNames = array('crawled_page.html' => true, 'links.txt' => true);
		$fetchedUrls = array();

		// Stylesheets, scripts and images. Contents go straight into the
		// archive; nothing downloaded is ever written into the web directory.
		$assetSources = array(
			array('link', 'href'),
			array('script', 'src'),
			array('img', 'src'),
		);
		foreach ($assetSources as $source) {
			foreach (crawlerExtractUrls($html, $source[0], $source[1]) as $reference) {
				$assetUrl = crawlerResolveUrl($url, $reference);
				if ($assetUrl === false || isset($fetchedUrls[$assetUrl])) {
					continue;
				}
				$fetchedUrls[$assetUrl] = true;

				$content = crawlerFetch($assetUrl);
				if ($content === false) {
					continue; // a single unreachable asset should not abort the crawl
				}

				$entryName = crawlerArchiveName($assetUrl, $usedNames);
				$usedNames[$entryName] = true;
				$zip->addFromString($entryName, $content);
			}
		}

		if (!$zip->close()) {
			if (is_file($zipPath)) {
				unlink($zipPath);
			}
			echo "압축 파일을 생성할 수 없습니다.";
		} else {
			// The form markup above this script has already been emitted; drop
			// it from the output buffers so it does not corrupt the download.
			while (ob_get_level() > 0 && @ob_end_clean()) {
				// keep unwinding nested buffers
			}

			if (headers_sent()) {
				// Output was already flushed to the client, so a binary
				// download can no longer be started on this response.
				unlink($zipPath);
				die("헤더가 이미 전송되어 파일을 다운로드할 수 없습니다.");
			}

			header('Content-Type: application/zip');
			header('Content-Disposition: attachment; filename="' . $zipFilename . '"');
			header('Content-Length: ' . filesize($zipPath));
			readfile($zipPath);

			// Remove the temporary archive and stop, so the rest of the page
			// is not appended to the zip data.
			unlink($zipPath);
			exit;
		}
	}
}
?>

로그인

회원로그인

로그인

웹사이트 크롤링 후 파일 압축 다운로드 > 자료실

인기검색어

회원로그인

오늘 본 게시물

오늘 본 페이지

오늘 본 C.m.A Util

자료실

C.m.A Lib

자료실

웹사이트 크롤링 후 파일 압축 다운로드

페이지 정보

본문