ss

1024dc56 · 白满斌 · e5ebd9a2 · 1024dc56 · 1024dc56 · 1024dc56
Commit 1024dc56 authored Dec 04, 2025 by 白满斌
10 changed files
--- a/public/list.html
+++ b/public/list.html
--- a/public/t.php
+++ b/public/t.php
+<?php
+// Scrape script: parses the saved listing page (list.html), prints the exam-type
+// links, then collects every notice entry into $allData and dumps it.
+
+// Resolve the page relative to this script so the result does not depend on the
+// current working directory.
+$html = file_get_contents(__DIR__ . '/list.html');
+if ($html === false || trim($html) === '') {
+    // DOMDocument::loadHTML() cannot work with a missing or empty document.
+    exit("无法读取 list.html\n");
+}
+
+// Collect libxml parse errors internally instead of silencing them with "@";
+// the previous error mode is restored once parsing is done.
+$previousErrorMode = libxml_use_internal_errors(true);
+$dom = new DOMDocument();
+$loaded = $dom->loadHTML($html, LIBXML_NOERROR | LIBXML_NOWARNING);
+libxml_clear_errors();
+libxml_use_internal_errors($previousErrorMode);
+if (!$loaded) {
+    exit("无法解析 list.html\n");
+}
+$xpath = new DOMXPath($dom);
+
+echo "=== 考试类型链接 ===\n";
+$examNodes = $xpath->query("//dl[dt='考试类型：']//ul[@class='type-list']/li/a");
+foreach ($examNodes as $node) {
+    echo trim($node->nodeValue) . ": " . $node->getAttribute('href') . "\n";
+}
+
+echo "\n=== 公告列表 ===\n";
+
+// Primary strategy: notice items inside the main content column.
+$mainContent = $xpath->query("//div[contains(@class, 'mdn-content-l')]//ul[@class='link-list']/li");
+echo "主内容区公告数量: " . $mainContent->length . "\n";
+
+// Each record holds the label texts under numeric keys 0..n-1, plus the
+// string keys 'title', 'url' and 'date' when the item contains a link.
+$allData = [];
+if ($mainContent->length > 0) {
+    foreach ($mainContent as $node) {
+        // Collect every label tag, with the surrounding square brackets removed.
+        $labels = [];
+        foreach ($xpath->query(".//i[@class='notice-label']", $node) as $labelNode) {
+            $labels[] = str_replace(['[', ']'], '', trim($labelNode->nodeValue));
+        }
+
+        // Title, link and date are taken from the first <a> / <time> of the item.
+        $linkNode = $xpath->query(".//a", $node)->item(0);
+        if ($linkNode) {
+            $labels['title'] = trim($linkNode->nodeValue);
+            $labels['url'] = $linkNode->getAttribute('href');
+
+            $timeNode = $xpath->query(".//time", $node)->item(0);
+            $labels['date'] = $timeNode ? trim($timeNode->nodeValue) : '';
+        }
+
+        // Items without a link are still recorded, carrying only their labels.
+        $allData[] = $labels;
+    }
+} else {
+    echo "未找到公告列表！尝试备用方法...\n";
+
+    // Fallback strategy: scan every <li> and keep those that contain both a
+    // notice label and a link.
+    $noticeCount = 0;
+
+    foreach ($xpath->query("//li") as $node) {
+        $labelNodes = $xpath->query(".//i[@class='notice-label']", $node);
+        $linkNode = $xpath->query(".//a", $node)->item(0);
+
+        if ($labelNodes->length === 0 || !$linkNode) {
+            continue;
+        }
+        $noticeCount++;
+
+        $labels = [];
+        foreach ($labelNodes as $labelNode) {
+            $labels[] = str_replace(['[', ']'], '', trim($labelNode->nodeValue));
+        }
+
+        $title = trim($linkNode->nodeValue);
+        $url = $linkNode->getAttribute('href');
+
+        // Print before the string keys are added: at least one label is
+        // guaranteed here, but a second one may be missing, so it falls back
+        // to a placeholder instead of reading an undefined offset.
+        $firstLabel = $labels[0];
+        $secondLabel = isset($labels[1]) ? $labels[1] : '未知';
+        echo "[{$firstLabel}]-[{$secondLabel}]-$title:$url\n";
+
+        $timeNode = $xpath->query(".//time", $node)->item(0);
+        $labels['title'] = $title;
+        $labels['url'] = $url;
+        $labels['date'] = $timeNode ? trim($timeNode->nodeValue) : '';
+
+        // Record fallback hits too, so the final dump is not empty on this path.
+        $allData[] = $labels;
+    }
+
+    echo "通过备用方法找到公告数量: $noticeCount\n";
+}
+
+var_dump($allData);
+
+
+
+?>
\ No newline at end of file
--- a/public/test.php
+++ b/public/test.php
+<?php
+/**
+ * Extracts named sections of a listing page as HTML fragments.
+ *
+ * The markup is parsed once in the constructor. Every getter runs an XPath
+ * query against that DOM and returns an empty string (or an empty array) when
+ * nothing matches.
+ */
+class HtmlExtractor {
+    /** @var DOMDocument Parsed document shared by all queries. */
+    private $dom;
+
+    /** @var DOMXPath XPath evaluator bound to $dom. */
+    private $xpath;
+
+    /**
+     * @param string $html Raw HTML markup (the document itself, not a file path).
+     */
+    public function __construct($html) {
+        $this->dom = new DOMDocument();
+
+        // loadHTML() rejects an empty string (ValueError on PHP 8), so empty
+        // input simply leaves the DOM empty and every getter returns ''.
+        if (trim((string) $html) !== '') {
+            // Collect parse errors internally instead of silencing them with
+            // "@", then restore the caller's libxml error mode.
+            $previousErrorMode = libxml_use_internal_errors(true);
+            $this->dom->loadHTML($html, LIBXML_NOERROR | LIBXML_NOWARNING);
+            libxml_clear_errors();
+            libxml_use_internal_errors($previousErrorMode);
+        }
+
+        $this->xpath = new DOMXPath($this->dom);
+    }
+
+    /**
+     * Exam-type block.
+     *
+     * @param bool $includeWrapper When true, return the enclosing
+     *                             div.type-info instead of the <dl> alone.
+     * @return string Outer HTML of the first match, or '' when absent.
+     */
+    public function getExamTypeHtml($includeWrapper = false) {
+        $expression = $includeWrapper
+            ? "//div[@class='type-info']"
+            : "//dl[dt='考试类型：']";
+
+        return $this->firstOuterHtml($expression);
+    }
+
+    /**
+     * News-type block.
+     *
+     * @return string Outer HTML of the first match, or '' when absent.
+     */
+    public function getNewsTypeHtml() {
+        return $this->firstOuterHtml("//dl[dt='资讯类型：']");
+    }
+
+    /**
+     * Notice list.
+     *
+     * @param bool $includePagination When true, return div.md-notice; when
+     *                                false, return div.notice-list only.
+     * @return string Outer HTML of the first match, or '' when absent.
+     */
+    public function getNoticeListHtml($includePagination = true) {
+        $expression = $includePagination
+            ? "//div[@class='md-notice']"
+            : "//div[@class='notice-list']";
+
+        return $this->firstOuterHtml($expression);
+    }
+
+    /**
+     * Pagination block.
+     *
+     * @return string Outer HTML of the first match, or '' when absent.
+     */
+    public function getPaginationHtml() {
+        return $this->firstOuterHtml("//div[@class='pagelist-box']");
+    }
+
+    /**
+     * "Hot exams" recommendation box, identified by its title text.
+     *
+     * @return string Outer HTML of the first match, or '' when absent.
+     */
+    public function getHotExamHtml() {
+        return $this->firstOuterHtml("//div[@class='md-hot-recommend'][div/span[@class='title-name' and text()='热点考试']]");
+    }
+
+    /**
+     * "Hot jobs" recommendation box, identified by its title text.
+     *
+     * @return string Outer HTML of the first match, or '' when absent.
+     */
+    public function getHotJobHtml() {
+        return $this->firstOuterHtml("//div[@class='md-hot-recommend'][div/span[@class='title-name' and text()='热门职位']]");
+    }
+
+    /**
+     * Complete sidebar.
+     *
+     * @return string Outer HTML of the first match, or '' when absent.
+     */
+    public function getSidebarHtml() {
+        return $this->firstOuterHtml("//aside[@class='mdn-aside']");
+    }
+
+    /**
+     * Runs an XPath query and serializes its first match.
+     *
+     * @param string $expression XPath expression evaluated from the document root.
+     * @return string Outer HTML of the first matching node, or '' when none.
+     */
+    private function firstOuterHtml($expression) {
+        $nodes = $this->xpath->query($expression);
+        // query() returns false for an invalid expression; treat it as no match.
+        $node = $nodes === false ? null : $nodes->item(0);
+
+        return $node ? $this->getOuterHtml($node) : '';
+    }
+
+    /**
+     * Serializes a node including its own tag.
+     *
+     * @param DOMNode $node
+     * @return string
+     */
+    private function getOuterHtml($node) {
+        return $this->dom->saveHTML($node);
+    }
+
+    /**
+     * Serializes only the children of a node.
+     *
+     * @param DOMNode $node
+     * @return string
+     */
+    private function getInnerHtml($node) {
+        $innerHTML = '';
+        foreach ($node->childNodes as $child) {
+            $innerHTML .= $this->dom->saveHTML($child);
+        }
+        return $innerHTML;
+    }
+
+    /**
+     * Every link of the exam-type list.
+     *
+     * @return array List of ['text' => string, 'href' => string, 'html' => string].
+     */
+    public function getExamTypeLinks() {
+        $links = [];
+        $nodes = $this->xpath->query("//dl[dt='考试类型：']//ul[@class='type-list']/li/a");
+
+        foreach ($nodes as $node) {
+            $links[] = [
+                'text' => trim($node->nodeValue),
+                'href' => $node->getAttribute('href'),
+                'html' => $this->dom->saveHTML($node)
+            ];
+        }
+
+        return $links;
+    }
+
+    /**
+     * Every <li> found under div.notice-list.
+     *
+     * @return array List of ['index' => int, 'html' => string, 'inner_html' => string].
+     */
+    public function getNoticeItems() {
+        $items = [];
+        $nodes = $this->xpath->query("//div[@class='notice-list']//li");
+
+        foreach ($nodes as $index => $node) {
+            $items[] = [
+                'index' => $index,
+                'html' => $this->dom->saveHTML($node),
+                'inner_html' => $this->getInnerHtml($node)
+            ];
+        }
+
+        return $items;
+    }
+
+    /**
+     * Runs every extractor and groups the results by page section.
+     *
+     * @return array Nested array keyed by 'exam_type', 'news_type',
+     *               'notice_list', 'pagination' and 'sidebar'.
+     */
+    public function extractAll() {
+        return [
+            'exam_type' => [
+                'html' => $this->getExamTypeHtml(true),
+                'links' => $this->getExamTypeLinks()
+            ],
+            'news_type' => [
+                'html' => $this->getNewsTypeHtml()
+            ],
+            'notice_list' => [
+                'html' => $this->getNoticeListHtml(true),
+                'items_html' => $this->getNoticeListHtml(false),
+                'items' => $this->getNoticeItems()
+            ],
+            'pagination' => [
+                'html' => $this->getPaginationHtml()
+            ],
+            'sidebar' => [
+                'html' => $this->getSidebarHtml(),
+                'hot_exam' => $this->getHotExamHtml(),
+                'hot_job' => $this->getHotJobHtml()
+            ]
+        ];
+    }
+
+}
+
+// Usage example: parse the saved listing page and write each extracted
+// section to its own file in the current working directory.
+
+// The extractor expects the markup itself, so read the file instead of
+// passing its path.
+$html = file_get_contents(__DIR__ . '/list.html');
+if ($html === false) {
+    exit("无法读取 list.html\n");
+}
+
+$extractor = new HtmlExtractor($html);
+
+// 1. Extract every section; the array keys used below come from extractAll().
+$data = $extractor->extractAll();
+
+// 2. Save each HTML fragment separately.
+file_put_contents('exam_type_full.html', $data['exam_type']['html']);
+file_put_contents('news_type.html', $data['news_type']['html']);
+file_put_contents('notice_list.html', $data['notice_list']['html']);
+file_put_contents('notice_items_only.html', $data['notice_list']['items_html']);
+file_put_contents('pagination.html', $data['pagination']['html']);
+
+// 3. 生成完整的HTML页面
+//$fullPage = $extractor->generateHtmlPage();
+//file_put_contents('extracted_data.html', $fullPage);
+//
+//// 4. 输出各部分信息
+//echo "=== 提取完成 ===\n";
+//echo "考试类型HTML长度: " . strlen($data['exam_type']['html']) . " 字节\n";
+//echo "资讯类型HTML长度: " . strlen($data['news_type']['html']) . " 字节\n";
+//echo "公告列表HTML长度: " . strlen($data['notice_list']['html']) . " 字节\n";
+//echo "公告项数量: " . count($data['notice_list']['items']) . " 条\n";
+//echo "考试类型链接数量: " . count($data['exam_type']['links']) . " 个\n\n";
+//
+//echo "已保存以下文件：\n";
+//echo "- exam_type_full.html (考试类型完整HTML)\n";
+//echo "- news_type.html (资讯类型HTML)\n";
+//echo "- notice_list.html (公告列表完整HTML)\n";
+//echo "- notice_items_only.html (仅公告项目HTML)\n";
+//echo "- pagination.html (分页HTML)\n";
+//echo "- extracted_data.html (完整提取结果页面)\n";
+//
+//// 5. 查看提取的链接
+//echo "\n=== 考试类型链接 ===\n";
+//foreach ($data['exam_type']['links'] as $link) {
+//    echo "- " . $link['text'] . ": " . $link['href'] . "\n";
+//}
+//
+//// 6. 输出部分公告HTML作为预览
+//echo "\n=== 公告预览（前3条） ===\n";
+//for ($i = 0; $i < min(3, count($data['notice_list']['items'])); $i++) {
+//    echo "【公告 " . ($i + 1) . "】\n";
+//    echo htmlspecialchars(substr($data['notice_list']['items'][$i]['html'], 0, 200)) . "...\n\n";
+//}
+?>
\ No newline at end of file
--- a/public/uploads/tianjin/321.png
+++ b/public/uploads/tianjin/321.png
--- a/public/uploads/tianjin/322.png
+++ b/public/uploads/tianjin/322.png
--- a/public/uploads/tianjin/323.png
+++ b/public/uploads/tianjin/323.png
--- a/public/uploads/tianjin/324.png
+++ b/public/uploads/tianjin/324.png
--- a/public/uploads/tianjin/325.png
+++ b/public/uploads/tianjin/325.png
--- a/public/uploads/tianjin/326.png
+++ b/public/uploads/tianjin/326.png
--- a/public/uploads/tianjin/327.png
+++ b/public/uploads/tianjin/327.png