웹사이트 크롤러후 파일 압축 다운로드2 > 자료실

웹사이트 크롤러후 파일 압축 다운로드2

페이지 정보

본문

<?php
ob_start("ob_gzhandler"); // Buffer all output and pass it through the gzip output handler
// Original author's note: the buffering above resolved an error that occurred when the styles were applied
/**
 * Reduce a URL-like string to what is left after removing its scheme,
 * a leading "www." and everything from the first "/" onwards.
 *
 * @param string $input URL or domain as typed by the user.
 * @return string The remaining text (for a plain URL this is the host).
 */
function getDomain($input) {
	// preg_replace() applies an array of patterns one after another to the
	// running result, so the order below matters:
	//   1. leading http:// or https://
	//   2. leading www.
	//   3. the first "/" and everything after it
	$stripPatterns = array('#^https?://#', '#^www\.#', '#/.*$#');
	return preg_replace($stripPatterns, '', $input);
}
/**
 * Check whether the robots.txt of the URL's host permits fetching the URL.
 *
 * Only "Disallow" rules are evaluated, and only those that belong to a
 * "User-agent: *" group (or that appear before any User-agent line).
 * A rule matches when it is a prefix of the URL's path (plus query string).
 * "Allow" rules and the "*" / "$" wildcards are not interpreted.
 *
 * @param string $url Absolute URL that is about to be crawled.
 * @return bool True when crawling is permitted, or when no robots.txt could
 *              be located or read; false when a matching Disallow rule exists.
 */
function isCrawlingAllowed($url) {
	$parsedUrl = parse_url($url);
	// Without a scheme and host there is no robots.txt to consult; treat it
	// like a missing robots.txt instead of raising undefined-index warnings.
	if (!is_array($parsedUrl) || empty($parsedUrl['scheme']) || empty($parsedUrl['host'])) {
		return true;
	}

	$robotsUrl = $parsedUrl['scheme'] . '://' . $parsedUrl['host'];
	if (isset($parsedUrl['port'])) {
		// robots.txt is per origin, so a non-default port must be kept.
		$robotsUrl .= ':' . $parsedUrl['port'];
	}
	$robotsUrl .= '/robots.txt';

	$robotsContent = @file_get_contents($robotsUrl);
	if ($robotsContent === false) {
		return true; // No robots.txt available: crawling is allowed
	}

	// Rules are written against the path, never against the full URL.
	$targetPath = (isset($parsedUrl['path']) && $parsedUrl['path'] !== '') ? $parsedUrl['path'] : '/';
	if (isset($parsedUrl['query'])) {
		$targetPath .= '?' . $parsedUrl['query'];
	}

	$groupApplies = true;   // rules seen before any User-agent line are honoured
	$readingAgents = false; // true while inside a run of consecutive User-agent lines
	foreach (preg_split('/\r\n|\r|\n/', $robotsContent) as $line) {
		// Drop trailing comments.
		$hashPos = strpos($line, '#');
		if ($hashPos !== false) {
			$line = substr($line, 0, $hashPos);
		}
		$parts = explode(':', $line, 2);
		if (count($parts) !== 2) {
			continue;
		}
		$field = strtolower(trim($parts[0]));
		$value = trim($parts[1]);

		if ($field === 'user-agent') {
			if (!$readingAgents) {
				// First User-agent line after some rules starts a new group.
				$groupApplies = false;
				$readingAgents = true;
			}
			if ($value === '*') {
				$groupApplies = true;
			}
			continue;
		}

		if ($field === 'allow' || $field === 'disallow') {
			$readingAgents = false;
		}

		// An empty Disallow value means "nothing is disallowed".
		if ($field === 'disallow' && $groupApplies && $value !== '') {
			if (strncmp($targetPath, $value, strlen($value)) === 0) {
				return false;
			}
		}
	}

	return true;
}
   if (isset($_POST["submit"]) && $_SERVER["REQUEST_METHOD"] == "POST") {
		// Handle the crawl form: fetch the page, collect the selected assets,
		// pack everything into a zip archive and stream it as a download.

		// Read the URL or bare domain submitted by the client.
		$input = (isset($_POST["url"]) && is_string($_POST["url"])) ? trim($_POST["url"]) : '';

		// A bare domain has no scheme and can never pass FILTER_VALIDATE_URL,
		// so give it one. getDomain() drops any path from scheme-less input.
		$url = filter_var($input, FILTER_VALIDATE_URL) ? $input : 'https://' . getDomain($input);
		if (filter_var($url, FILTER_VALIDATE_URL) === false) {
			die("유효하지 않은 URL입니다.");
		}
		// FILTER_VALIDATE_URL also accepts file://, ftp:// and others; only web
		// URLs may be fetched, otherwise local files could be read.
		// NOTE(review): hosts on the internal network are still reachable;
		// restrict them if this script is exposed publicly.
		$scheme = strtolower((string) parse_url($url, PHP_URL_SCHEME));
		if ($scheme !== 'http' && $scheme !== 'https') {
			die("유효하지 않은 URL입니다.");
		}

		// The page promises robots.txt compliance, so actually check it.
		if (!isCrawlingAllowed($url)) {
			die("robots.txt에서 크롤링을 허용하지 않는 페이지입니다.");
		}

		// Fetch the page HTML.
		$html = @file_get_contents($url);
		if ($html === false) {
			die("웹 페이지를 가져올 수 없습니다.");
		}

		// Items the user ticked; the key is absent when nothing is ticked.
		$crawling_items = (isset($_POST["crawling_items"]) && is_array($_POST["crawling_items"]))
			? $_POST["crawling_items"]
			: array();

		// Collect link targets (stored as text only, never downloaded).
		$linkFiles = array();
		if (in_array("links", $crawling_items, true)) {
			preg_match_all('/<a\s+href="([^"]+)"/i', $html, $matches);
			$linkFiles = $matches[1];
		}

		// One extraction pattern per selected asset type.
		$assetPatterns = array();
		if (in_array("css", $crawling_items, true)) {
			$assetPatterns[] = '/<link\s+rel="stylesheet"\s+href="([^"]+)"/i';
		}
		if (in_array("js", $crawling_items, true)) {
			$assetPatterns[] = '/<script\s+src="([^"]+)"/i';
		}
		if (in_array("images", $crawling_items, true)) {
			$assetPatterns[] = '/<img\s+src="([^"]+)"/i';
		}

		// Resolve every reference to an absolute http(s) URL. Anything else
		// (local paths, data:, javascript:, file:) is skipped so it can never
		// be handed to file_get_contents() as a local filename.
		$assetUrls = array();
		foreach ($assetPatterns as $pattern) {
			if (preg_match_all($pattern, $html, $matches)) {
				foreach ($matches[1] as $reference) {
					$absoluteUrl = crawlerResolveUrl($reference, $url);
					if ($absoluteUrl !== null) {
						$assetUrls[$absoluteUrl] = true; // keyed to de-duplicate
					}
				}
			}
		}

		// Name offered to the browser; the archive itself is built in a unique
		// temporary file so concurrent requests cannot clobber each other.
		$zipFilename = "crawled_files.zip";
		$zipPath = tempnam(sys_get_temp_dir(), 'crawl');
		$zip = new ZipArchive();
		$archiveReady = false;
		if ($zipPath !== false && $zip->open($zipPath, ZipArchive::CREATE | ZipArchive::OVERWRITE) === true) {
			$zip->addFromString('crawled_page.html', $html);
			// The link list is always included, empty when links were not selected.
			$zip->addFromString('links.txt', implode(PHP_EOL, $linkFiles));

			// Assets go straight from memory into the archive. Nothing with a
			// remote-controlled name is ever written into the script directory.
			$usedNames = array('crawled_page.html' => true, 'links.txt' => true);
			foreach (array_keys($assetUrls) as $assetUrl) {
				$assetContent = @file_get_contents($assetUrl);
				if ($assetContent === false) {
					continue; // one unreachable asset must not abort the crawl
				}
				$entryName = crawlerEntryName($assetUrl, $usedNames);
				$usedNames[strtolower($entryName)] = true;
				$zip->addFromString($entryName, $assetContent);
			}
			$archiveReady = $zip->close();
		}

		if ($archiveReady) {
			// Send the archive as a download. No Content-Length is sent because
			// the active output handler may gzip-encode the response body.
			header('Content-Type: application/zip');
			header('Content-Disposition: attachment; filename="' . $zipFilename . '"');
			readfile($zipPath);
			unlink($zipPath);
			// Stop here so the HTML page below is not appended to the zip.
			exit;
		}

		if ($zipPath !== false && is_file($zipPath)) {
			unlink($zipPath);
		}
		echo "압축 파일을 생성할 수 없습니다.";
	}

/**
 * Turn a reference found in the crawled page into an absolute http(s) URL.
 *
 * Dot segments ("../") are not collapsed; the result is the base directory
 * with the reference appended.
 *
 * @param string $reference Raw attribute value taken from the HTML.
 * @param string $baseUrl   Absolute URL of the page the reference came from.
 * @return string|null Absolute URL, or null when the reference is empty, a
 *                     fragment, or uses a scheme other than http/https.
 */
function crawlerResolveUrl($reference, $baseUrl) {
	// Attribute values are HTML-encoded (e.g. "&amp;" inside query strings).
	$reference = trim(html_entity_decode($reference, ENT_QUOTES, 'UTF-8'));
	if ($reference === '' || $reference[0] === '#') {
		return null;
	}
	$base = parse_url($baseUrl);
	if (!is_array($base) || empty($base['scheme']) || empty($base['host'])) {
		return null;
	}
	// Protocol-relative reference: inherit the page's scheme.
	if (strpos($reference, '//') === 0) {
		return $base['scheme'] . ':' . $reference;
	}
	// Reference that carries its own scheme.
	if (preg_match('#^([a-z][a-z0-9+.\-]*):#i', $reference, $schemeMatch)) {
		$scheme = strtolower($schemeMatch[1]);
		return ($scheme === 'http' || $scheme === 'https') ? $reference : null;
	}
	$origin = $base['scheme'] . '://' . $base['host'];
	if (isset($base['port'])) {
		$origin .= ':' . $base['port'];
	}
	// Root-relative reference.
	if ($reference[0] === '/') {
		return $origin . $reference;
	}
	// Document-relative reference: resolve against the page's directory.
	$basePath = isset($base['path']) ? $base['path'] : '/';
	$lastSlash = strrpos($basePath, '/');
	$baseDir = ($lastSlash === false) ? '/' : substr($basePath, 0, $lastSlash + 1);
	return $origin . $baseDir . $reference;
}

/**
 * Derive a safe, unique archive entry name from an asset URL.
 *
 * @param string $assetUrl  Absolute URL of the asset.
 * @param array  $usedNames Map of lower-cased entry names already taken.
 * @return string File name without directory parts that is not in $usedNames.
 */
function crawlerEntryName($assetUrl, $usedNames) {
	// Use only the path so query strings do not end up in the file name.
	$path = parse_url($assetUrl, PHP_URL_PATH);
	$name = is_string($path) ? basename($path) : '';
	// Keep a conservative character set and no leading dots, so extracting
	// the archive cannot create hidden files or escape the target directory.
	$name = ltrim((string) preg_replace('/[^A-Za-z0-9._-]/', '_', $name), '.');
	if ($name === '') {
		$name = 'file';
	}
	$candidate = $name;
	$dotPos = strrpos($name, '.');
	$suffix = 1;
	while (isset($usedNames[strtolower($candidate)])) {
		// Insert the counter before the extension: style.css -> style-1.css
		$candidate = ($dotPos === false)
			? $name . '-' . $suffix
			: substr($name, 0, $dotPos) . '-' . $suffix . substr($name, $dotPos);
		$suffix++;
	}
	return $candidate;
}
	?>

<style>
body {
	font-family: Arial, sans-serif;
	line-height: 1.6;
	max-width: 800px;
	margin: 0 auto;
	padding: 20px;
	background-color: #f2f2f2;
}
h1 {
	text-align: center;
	margin-bottom: 30px;
	color: #0066cc;
}
form {
	margin-bottom: 30px;
}
label {
	display: block;
	font-weight: bold;
	margin-bottom: 5px;
	color: #0066cc;
}
input[type="text"] {
	width: 100%;
	padding: 10px;
	font-size: 16px;
	border: 1px solid #ccc;
	border-radius: 5px;
	margin-bottom: 10px;
}
input[type="checkbox"] {
	margin-right: 5px;
}
button {
	background-color: #0066cc;
	color: white;
	padding: 10px 20px;
	border: none;
	border-radius: 5px;
	cursor: pointer;
}
button:hover {
	background-color: #0052a3;
}
h2 {
	font-size: 1em;
	margin-top: 30px;
	color: #0066cc;
}
h3 {
	font-size: 1em;
	margin-top: 50px;
	color: #0066cc;
	text-align:center
}
a {
	text-decoration: none;
	color: #0066cc;
}
</style>


<h1>The 웹 크롤러; 웹 페이지 크롤링 및 다운로드</h1>
<form action="" method="post">
	<label for="url">크롤링할 웹 페이지 URL:</label>

	<input type="text" name="url" id="url" size="50" placeholder='https://example.com'  required>

	<!-- 체크박스로 원하는 항목 선택 -->
	<input type="checkbox" name="crawling_items[]" value="links" id="links">
	<label for="links">링크 크롤링(선택)</label>


	<input type="checkbox" name="crawling_items[]" value="css" id="css">
	<label for="css">CSS 파일 크롤링(선택)</label>


	<input type="checkbox" name="crawling_items[]" value="js" id="js">
	<label for="js">JavaScript 파일 크롤링(선택)</label>


	<input type="checkbox" name="crawling_items[]" value="images" id="images">
	<label for="images">이미지 파일 크롤링(선택)</label>

	<button type="submit" name="submit">크롤링 및 파일 압축 다운로드</button>
</form>
<h2 style='margin-top:25%'>특징 및 사용방법:</h2>
<h2>- 특징</h2>
<ul>
	<li>크롤링 파일을 실행하는 서버가 그누보드5를 지원할 경우 바로 이용 가능합니다.</li>
	<li>지원하지 않는다면 gd라이브러리, gzip, zip(ZipArchive) 확장 등을 설치해주세요. dsclub.kr에 설치방법이 있습니다.</li>
	<li>입력한 웹 페이지의 HTML, LINKS, CSS, JS,  이미지 파일을 크롤링하고 다운로드할 수 있습니다.</li>
	<li>크롤링한 링크 목록은 `links.txt` 파일로 저장되어 함께 제공됩니다.</li>
	<li>파일 다운로드 시 Zip 파일 형식으로 압축하여 제공됩니다.</li>
	<li>크롤링 대상 서버의 robots.txt 수집 동의 여부에 따라 크롤링하는 합법적인 크롤러입니다.</li>
</ul>
 
<h2>- 사용법</h2>
<ol>
	<li>웹 서버의 root폴더 ex.) /var/www/html의 html에 crawled.php가 위치하게 해주세요.</li>
	<li>크롤링하고자 하는 웹 페이지의 URL(링크)을 입력하고, "크롤링 및 파일 압축 다운로드" 버튼을 클릭합니다.</li>
	<li>웹 페이지 크롤링이 진행되며 크롤링 완료 후 크롤링 결과물은 압축 파일로 다운로드합니다.</li>
	<li>크롤링할 웹 페이지 URL 입력: 웹 크롤링을 원하는 웹 페이지의 URL을 입력해주세요.</li>
	<li>링크 크롤링(선택): 웹 페이지에 포함된 모든 링크 URL을 크롤링합니다.</li>
	<li>CSS 파일 크롤링(선택): 웹 페이지에 포함된 모든 CSS 파일을 크롤링합니다.</li>
	<li>JavaScript 파일 크롤링(선택): 웹 페이지에 포함된 모든 JavaScript 파일을 크롤링합니다.</li>
	<li>이미지 크롤링(선택): 웹 페이지에 포함된 모든 이미지를 크롤링합니다.</li>
</ol>
 
 <h2>참고*</h2>
 <ul>
	<li>html파일은 선택하지 않아도 기본값으로 크롤링하게 되어있습니다.</li>
	<li>The 웹 크롤러 사용으로 인한 법적 분쟁, 서버 손상 등은 dsclub.kr의 Tak2가 책임지지 않습니다.</li>
	<li>업데이트는 비정기적으로 이루어집니다. -업데이트 확인: <a href='https://dsclub.kr'>dsclub.kr</a></li>
	<li>The 웹 크롤러를 무단 수정하여 배포하지 말아주세요, 관련 문의: dsclub2023@gmail.com</li>
	<li>오류 등 기타 문의: dsclub2023@gmail.com</li>
	<li>Produced by Tak2를 제거하지 말아주세요.</li>
 </ul>
<h3 style='font-size:1em'><a href='https://dsclub.kr'>Produced by Tak2</a></h3>


<!--input 텍스트 표시 기능{-->
<script>
	// Toggle the placeholder of the URL field: hide it while the field has
	// focus and restore it when focus leaves.
	(function () {
		// The form's URL input carries id="url"; looking up a non-existent id
		// returns null and makes addEventListener throw.
		var urlField = document.getElementById("url");
		if (!urlField) {
			return;
		}

		// Clear the placeholder while the field is focused.
		urlField.addEventListener("focus", function () {
			this.placeholder = "";
		});

		// Show the placeholder again once the field loses focus.
		urlField.addEventListener("blur", function () {
			this.placeholder = "https://example.com";
		});
	})();
</script>
<!--}input-->

로그인

회원로그인

로그인

웹사이트 크롤러후 파일 압축 다운로드2 > 자료실

인기검색어

회원로그인

오늘 본 게시물

오늘 본 페이지

오늘 본 C.m.A Util

자료실

C.m.A Lib

자료실

웹사이트 크롤러후 파일 압축 다운로드2

페이지 정보

본문