Commit 1024dc56 by 白满斌

ss

parent e5ebd9a2
<?php
// Scrapes a locally saved list page: prints the exam-type links, then collects
// every notice entry (labels, title, url, date) from the main content area.
// If the main content area yields nothing, falls back to scanning every <li>
// that carries a notice label and a link, and prints those instead.

$html = file_get_contents('./list.html');
if ($html === false || $html === '') {
    // DOMDocument::loadHTML() cannot parse a missing/empty source, so fail
    // with a clear message instead of a confusing parser error later on.
    throw new RuntimeException('Unable to read ./list.html (missing or empty)');
}

$dom = new DOMDocument();
// Collect parser diagnostics internally instead of hiding them with "@";
// the previous libxml setting is restored right after parsing.
$previousLibxmlSetting = libxml_use_internal_errors(true);
$dom->loadHTML($html, LIBXML_NOERROR | LIBXML_NOWARNING);
libxml_clear_errors();
libxml_use_internal_errors($previousLibxmlSetting);
$xpath = new DOMXPath($dom);

// Turns a list of label nodes into plain label strings without the
// surrounding square brackets. Shared by the primary and fallback paths.
$collectLabels = function ($labelNodes) {
    $labels = [];
    foreach ($labelNodes as $labelNode) {
        $labels[] = str_replace(['[', ']'], '', trim($labelNode->nodeValue));
    }
    return $labels;
};

echo "=== 考试类型链接 ===\n";
$examNodes = $xpath->query("//dl[dt='考试类型:']//ul[@class='type-list']/li/a");
foreach ($examNodes as $node) {
    echo trim($node->nodeValue) . ": " . $node->getAttribute('href') . "\n";
}

echo "\n=== 公告列表 ===\n";
// Primary lookup: notice entries inside the main content column.
$mainContent = $xpath->query("//div[contains(@class, 'mdn-content-l')]//ul[@class='link-list']/li");
echo "主内容区公告数量: " . $mainContent->length . "\n";

$allData = [];
if ($mainContent->length > 0) {
    foreach ($mainContent as $node) {
        // Numeric keys hold the labels; string keys hold title/url/date.
        $entry = $collectLabels($xpath->query(".//i[@class='notice-label']", $node));

        $linkNode = $xpath->query(".//a", $node)->item(0);
        if ($linkNode) {
            $entry['title'] = trim($linkNode->nodeValue);
            $entry['url'] = $linkNode->getAttribute('href');

            $timeNode = $xpath->query(".//time", $node)->item(0);
            $entry['date'] = $timeNode ? trim($timeNode->nodeValue) : '';
        }

        // Entries without a link are still recorded (labels only).
        $allData[] = $entry;
    }
} else {
    echo "未找到公告列表!尝试备用方法...\n";

    // Fallback: any <li> that has at least one notice label and a link.
    $noticeCount = 0;
    foreach ($xpath->query("//li") as $node) {
        $labelNodes = $xpath->query(".//i[@class='notice-label']", $node);
        $linkNode = $xpath->query(".//a", $node)->item(0);
        if ($labelNodes->length === 0 || !$linkNode) {
            continue;
        }

        $noticeCount++;
        $labels = $collectLabels($labelNodes);
        $title = trim($linkNode->nodeValue);
        $url = $linkNode->getAttribute('href');

        // At least one label is guaranteed here; the second one is optional,
        // so substitute a placeholder rather than reading a missing index.
        $secondLabel = isset($labels[1]) ? $labels[1] : '未知';
        echo "[{$labels[0]}]-[{$secondLabel}]-$title:$url\n";
    }
    echo "通过备用方法找到公告数量: $noticeCount\n";
}

var_dump($allData);
?>
\ No newline at end of file
<?php
/**
 * Pulls named fragments (exam types, news types, notice list, pagination,
 * sidebar widgets) out of an HTML document by XPath.
 *
 * Every "get...Html" method returns the serialized markup of the first
 * matching node, or an empty string when nothing matches.
 */
class HtmlExtractor {
    /** @var DOMDocument Parsed document all queries run against. */
    private $dom;
    /** @var DOMXPath XPath evaluator bound to $dom. */
    private $xpath;

    /**
     * @param string $html Raw HTML markup (not a file path). An empty string
     *                     yields an extractor whose lookups all come back empty.
     */
    public function __construct($html) {
        $this->dom = new DOMDocument();
        // loadHTML() rejects an empty source, so only parse when there is markup.
        if ((string) $html !== '') {
            // Buffer parser diagnostics instead of silencing them with "@",
            // and restore the caller's libxml setting afterwards.
            $previousSetting = libxml_use_internal_errors(true);
            $this->dom->loadHTML($html, LIBXML_NOERROR | LIBXML_NOWARNING);
            libxml_clear_errors();
            libxml_use_internal_errors($previousSetting);
        }
        $this->xpath = new DOMXPath($this->dom);
    }

    /**
     * Returns the exam-type block.
     *
     * @param bool $includeWrapper When true, returns the enclosing
     *                             div.type-info; otherwise the <dl> whose
     *                             <dt> is the exam-type caption.
     * @return string Outer HTML of the match, or '' if absent.
     */
    public function getExamTypeHtml($includeWrapper = false) {
        $expression = $includeWrapper
            ? "//div[@class='type-info']"
            : "//dl[dt='考试类型:']";
        return $this->firstOuterHtml($expression);
    }

    /**
     * Returns the <dl> whose <dt> is the news-type caption.
     *
     * @return string Outer HTML of the match, or '' if absent.
     */
    public function getNewsTypeHtml() {
        return $this->firstOuterHtml("//dl[dt='资讯类型:']");
    }

    /**
     * Returns the notice list.
     *
     * @param bool $includePagination When true, returns div.md-notice;
     *                                otherwise div.notice-list.
     * @return string Outer HTML of the match, or '' if absent.
     */
    public function getNoticeListHtml($includePagination = true) {
        $expression = $includePagination
            ? "//div[@class='md-notice']"
            : "//div[@class='notice-list']";
        return $this->firstOuterHtml($expression);
    }

    /**
     * Returns the pagination box (div.pagelist-box).
     *
     * @return string Outer HTML of the match, or '' if absent.
     */
    public function getPaginationHtml() {
        return $this->firstOuterHtml("//div[@class='pagelist-box']");
    }

    /**
     * Returns the recommendation widget titled "热点考试".
     *
     * @return string Outer HTML of the match, or '' if absent.
     */
    public function getHotExamHtml() {
        return $this->firstOuterHtml("//div[@class='md-hot-recommend'][div/span[@class='title-name' and text()='热点考试']]");
    }

    /**
     * Returns the recommendation widget titled "热门职位".
     *
     * @return string Outer HTML of the match, or '' if absent.
     */
    public function getHotJobHtml() {
        return $this->firstOuterHtml("//div[@class='md-hot-recommend'][div/span[@class='title-name' and text()='热门职位']]");
    }

    /**
     * Returns the whole sidebar (aside.mdn-aside).
     *
     * @return string Outer HTML of the match, or '' if absent.
     */
    public function getSidebarHtml() {
        return $this->firstOuterHtml("//aside[@class='mdn-aside']");
    }

    /**
     * Serializes the first node matching an XPath expression.
     *
     * @param string $expression XPath expression evaluated from the document root.
     * @return string Outer HTML of the first match, or '' if nothing matched.
     */
    private function firstOuterHtml($expression) {
        $node = $this->xpath->query($expression)->item(0);
        return $node ? $this->getOuterHtml($node) : '';
    }

    /**
     * Serializes a node including its own tag.
     *
     * @param DOMNode $node
     * @return string
     */
    private function getOuterHtml($node) {
        return $this->dom->saveHTML($node);
    }

    /**
     * Serializes only the children of a node (its own tag is omitted).
     *
     * @param DOMNode $node
     * @return string
     */
    private function getInnerHtml($node) {
        $innerHTML = '';
        foreach ($node->childNodes as $child) {
            $innerHTML .= $this->dom->saveHTML($child);
        }
        return $innerHTML;
    }

    /**
     * Lists every link inside the exam-type block.
     *
     * @return array List of ['text' => ..., 'href' => ..., 'html' => ...].
     */
    public function getExamTypeLinks() {
        $links = [];
        $nodes = $this->xpath->query("//dl[dt='考试类型:']//ul[@class='type-list']/li/a");
        foreach ($nodes as $node) {
            $links[] = [
                'text' => trim($node->nodeValue),
                'href' => $node->getAttribute('href'),
                'html' => $this->dom->saveHTML($node),
            ];
        }
        return $links;
    }

    /**
     * Lists every <li> found under div.notice-list.
     *
     * @return array List of ['index' => int, 'html' => outer, 'inner_html' => inner].
     */
    public function getNoticeItems() {
        $items = [];
        $nodes = $this->xpath->query("//div[@class='notice-list']//li");
        foreach ($nodes as $index => $node) {
            $items[] = [
                'index' => $index,
                'html' => $this->dom->saveHTML($node),
                'inner_html' => $this->getInnerHtml($node),
            ];
        }
        return $items;
    }

    /**
     * Runs every extractor and groups the results by page section.
     *
     * @return array Keys: exam_type, news_type, notice_list, pagination, sidebar.
     */
    public function extractAll() {
        return [
            'exam_type' => [
                'html' => $this->getExamTypeHtml(true),
                'links' => $this->getExamTypeLinks(),
            ],
            'news_type' => [
                'html' => $this->getNewsTypeHtml(),
            ],
            'notice_list' => [
                'html' => $this->getNoticeListHtml(true),
                'items_html' => $this->getNoticeListHtml(false),
                'items' => $this->getNoticeItems(),
            ],
            'pagination' => [
                'html' => $this->getPaginationHtml(),
            ],
            'sidebar' => [
                'html' => $this->getSidebarHtml(),
                'hot_exam' => $this->getHotExamHtml(),
                'hot_job' => $this->getHotJobHtml(),
            ],
        ];
    }
}
// 使用示例
$html = './list.html'; // 这里放入您提供的HTML
$extractor = new HtmlExtractor($html);
// 1. 获取所有提取的数据
$data = $extractor->getExamTypeHtml();
var_dump($data);die;
// 2. 分别保存各部分HTML
file_put_contents('exam_type_full.html', $data['exam_type']['html']);
file_put_contents('news_type.html', $data['news_type']['html']);
file_put_contents('notice_list.html', $data['notice_list']['html']);
file_put_contents('notice_items_only.html', $data['notice_list']['items_html']);
file_put_contents('pagination.html', $data['pagination']['html']);
// 3. 生成完整的HTML页面
//$fullPage = $extractor->generateHtmlPage();
//file_put_contents('extracted_data.html', $fullPage);
//
//// 4. 输出各部分信息
//echo "=== 提取完成 ===\n";
//echo "考试类型HTML长度: " . strlen($data['exam_type']['html']) . " 字节\n";
//echo "资讯类型HTML长度: " . strlen($data['news_type']['html']) . " 字节\n";
//echo "公告列表HTML长度: " . strlen($data['notice_list']['html']) . " 字节\n";
//echo "公告项数量: " . count($data['notice_list']['items']) . " 条\n";
//echo "考试类型链接数量: " . count($data['exam_type']['links']) . " 个\n\n";
//
//echo "已保存以下文件:\n";
//echo "- exam_type_full.html (考试类型完整HTML)\n";
//echo "- news_type.html (资讯类型HTML)\n";
//echo "- notice_list.html (公告列表完整HTML)\n";
//echo "- notice_items_only.html (仅公告项目HTML)\n";
//echo "- pagination.html (分页HTML)\n";
//echo "- extracted_data.html (完整提取结果页面)\n";
//
//// 5. 查看提取的链接
//echo "\n=== 考试类型链接 ===\n";
//foreach ($data['exam_type']['links'] as $link) {
// echo "- " . $link['text'] . ": " . $link['href'] . "\n";
//}
//
//// 6. 输出部分公告HTML作为预览
//echo "\n=== 公告预览(前3条) ===\n";
//for ($i = 0; $i < min(3, count($data['notice_list']['items'])); $i++) {
// echo "【公告 " . ($i + 1) . "】\n";
// echo htmlspecialchars(substr($data['notice_list']['items'][$i]['html'], 0, 200)) . "...\n\n";
//}
?>
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment