Commit 9ba65111 by 白满斌

爬取

parent 1024dc56
<?php
namespace App\Console\Commands;
use App\Models\LeidaModel;
use Illuminate\Console\Command;
use Illuminate\Support\Facades\DB;
use GuzzleHttp\Client;
use GuzzleHttp\Cookie\CookieJar;
use Illuminate\Support\Facades\Storage;
use Illuminate\Support\Facades\Log;
use App\Http\Services\BaseService;
class BatchDetail extends Command
{
    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'leida:detail
        {id : 要爬取的id}
        {max : 要爬取的max}';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = '爬取数据';

    /** @var string File that log() appends to. */
    protected $logFile = './paqu.txt';

    protected $frontUserModel;
    protected $orderModel;
    protected $dealRecordModel;

    // URL suffix per exam type; format: province-city-0-examType-recruitmentNotice.
    public static $ksType = [
        // '公务员' => '-0-0-2-124',
        // "事业单位" => "-0-0-3-124",
        // "教师" => "-0-0-59-124",
        "医疗" => "-0-0-60-124",
        "选调" => "-0-0-7-124",
        "遴选" => "-0-0-63-124",
        "选调生" => "-0-0-62-124",
        "三支一扶" => "-0-0-8-124",
        "大学生村官" => "-0-0-9-124",
        "基层工作者" => "-0-0-66-124",
        "银行" => "-0-0-67-124",
        "国企" => "-0-0-78-124",
        "公益性岗位" => "-0-0-80-124",
        "军队文职" => "-0-0-249-124",
    ];

    // Numeric area code => region name.
    public static $ksArea = [
        3510 => "国家",
        1117 => "安徽",
        1 => "北京",
        1255 => "福建",
        2129 => "广东",
        3191 => "甘肃",
        2290 => "广西",
        2723 => "贵州",
        37 => "河北",
        1849 => "湖北",
        705 => "黑龙江",
        1654 => "河南",
        2429 => "海南",
        1979 => "湖南",
        627 => "吉林",
        878 => "江苏",
        1359 => "江西",
        498 => "辽宁",
        374 => "内蒙古",
        3357 => "宁夏",
        3304 => "青海",
        2500 => "四川",
        1482 => "山东",
        859 => "上海",
        232 => "山西",
        3063 => "陕西",
        19 => "天津",
        2980 => "西藏",
        3390 => "新疆",
        2826 => "云南",
        1004 => "浙江",
        2460 => "重庆",
        3508 => "香港",
        3509 => "澳门",
        3507 => "台湾",
    ];

    protected $client;
    protected $jar;
    protected $baseStoragePath = 'crawled_data';

    /**
     * Execute the console command.
     *
     * Walks the rows whose id lies between the `id` and `max` arguments (inclusive),
     * fetches the article detail for each row and stores it on the row. Processing
     * stops at the first failure so the operator can inspect it before continuing.
     *
     * @return int 0 when every row was processed, 1 when the run stopped on a failure.
     */
    public function handle()
    {
        $leidaModel = new LeidaModel();
        $id = $this->argument('id');
        $max = $this->argument('max');

        $this->client = new Client([
            'timeout' => 30,
            'verify' => false, // NOTE: should be true in production.
            'allow_redirects' => [
                'max' => 5,
                'strict' => true,
                'referer' => true,
                'protocols' => ['http', 'https'],
            ],
        ]);

        $this->line('id: ' . $id . ', max: ' . $max);

        // chunkById() stops and returns false as soon as the callback returns false.
        $completed = $leidaModel
            ->whereBetween('id', [$id, $max])
            ->select(['*'])
            ->chunkById(50, function ($list) use ($leidaModel) {
                foreach ($list->toArray() as $row) {
                    if (!$this->crawlDetail($leidaModel, $row)) {
                        return false;
                    }
                    // Throttle between successful requests.
                    sleep(5);
                }

                return true;
            });

        return $completed === false ? 1 : 0;
    }

    /**
     * Fetch the detail of one row's article and persist it on that row.
     *
     * @param LeidaModel $leidaModel Model used to write the update.
     * @param array      $row        Row data; reads the `id` and `url` keys.
     *
     * @return bool True when the row was updated, false on any failure.
     */
    private function crawlDetail(LeidaModel $leidaModel, array $row): bool
    {
        $articleId = self::getLastNumberFromUrl($row['url']);
        if ($articleId === null) {
            // Without an article id the signed request would be meaningless.
            return $this->reportFailure('无法解析文章ID:' . $row['id'], ['id' => $row['id'], 'url' => $row['url']]);
        }

        // NOTE(review): the token and secret below are hard-coded credentials committed to
        // source control; they should move to configuration. Left unchanged here.
        $query = 'appid=uqsFgLOVbuPrfn1v&articleId=' . $articleId . '&from_device=h5&timestamp=' . time() . '&token=569fff2952c0317c29f3308df15cd75d11d15fmern1cdeol3f9tcfo31cdnb3o9o5yd03&userId=26239811';
        $sign = md5(urlencode($query . '&secret=Hf6yn1JPb1QZxniWhIPv1IrHbWeLh2e8'));
        $url = 'https://api.gongkaoleida.com/api/FrontBackSecure/article/detailUserPc?' . $query . '&sign=' . $sign;

        try {
            $response = $this->client->request('GET', $url);
            $statusCode = $response->getStatusCode();
            $content = json_decode($response->getBody()->getContents(), true);

            if ($statusCode != 200) {
                return $this->reportFailure('状态码异常:' . $statusCode, ['id' => $row['id'], 'url' => $url, 'content' => $content]);
            }

            // json_decode() yields null for a non-JSON body, so check the shape first.
            if (!is_array($content) || ($content['code'] ?? null) != 1) {
                return $this->reportFailure('接口返回异常:' . $row['id'], ['id' => $row['id'], 'url' => $url, 'content' => $content]);
            }

            $sourceData = $content['data']['articleInfo'] ?? [];
            if (empty($sourceData)) {
                return $this->reportFailure('数据异常:' . $row['id'], ['id' => $row['id'], 'url' => $url]);
            }

            $updateData = [
                'from_url' => $sourceData['sourcePageUrl'] ?? '',
                'from_title' => $sourceData['origin'] ?? '',
                'from_detail' => json_encode($sourceData),
            ];
            $leidaModel->updateData(['id' => $row['id']], $updateData);
            $this->info('done:' . $row['id']);

            return true;
        } catch (\Exception $e) {
            return $this->reportFailure($e->getMessage(), [
                'id' => $row['id'],
                'url' => $url,
                'error' => $e->getMessage(),
            ]);
        }
    }

    /**
     * Record a failure in the application log and on the console.
     *
     * @param string $message Failure description.
     * @param array  $context Extra data attached to the log entry.
     *
     * @return bool Always false, so callers can `return $this->reportFailure(...)`.
     */
    private function reportFailure($message, array $context = []): bool
    {
        Log::error('Web crawler error: ' . $message, $context);
        $this->error($message);

        return false;
    }

    /**
     * Extract the number forming the last path segment of a URL.
     *
     * An optional query string after the number is ignored.
     *
     * @param string $url
     *
     * @return int|null The number, or null when the URL does not end in a numeric segment.
     */
    public static function getLastNumberFromUrl($url) {
        if (preg_match('/\/(\d+)(?:\?.*)?$/', $url, $matches)) {
            return (int)$matches[1];
        }
        return null;
    }

    /**
     * Append one trimmed line to the log file.
     *
     * @param string $msg
     */
    protected function log($msg)
    {
        file_put_contents($this->logFile, trim($msg) . "\n", FILE_APPEND);
    }
}
<?php
namespace App\Console\Commands;
use App\Models\DealRecord;
use App\Models\DealRecordGoods;
use App\Models\DealRecordUser;
use Illuminate\Console\Command;
use Illuminate\Support\Facades\DB;
use Overtrue\Pinyin\Pinyin;
class BatchGetDealRecordOrderNo extends Command
{
    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'batch:get:dealRecord:orderNo';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = '批量获取交易记录订单号';

    /** @var string File that log() appends to; empty means logging is disabled. */
    protected $logFile = '';

    protected $dealRecordModel;
    protected $dealRecordUserModel;
    protected $dealRecordGoodsModel;

    /**
     * Execute the console command.
     *
     * Reads `217.txt` line by line. Each line is split on spaces: the first field is a
     * deal record number and the second a goods name. For every user row of that deal
     * record, the order number of the matching goods row is printed.
     *
     * @return int 0 on success, 1 when the input file cannot be opened.
     */
    public function handle() {
        $dealRecordUserModel = new DealRecordUser();
        $dealRecordGoodsModel = new DealRecordGoods();

        $fileName = '217.txt';
        $fp = is_readable($fileName) ? fopen($fileName, 'r') : false;
        if ($fp === false) {
            // feof()/fgets() on a failed handle would loop forever or throw.
            $this->error('无法打开文件: ' . $fileName);
            return 1;
        }

        // Counts file lines, so messages point at the real line in the input file.
        $line = 0;
        try {
            while (($row = fgets($fp)) !== false) {
                $line++;
                var_dump($line);

                $row = trim($row);
                if ($row === '') {
                    // Blank lines (e.g. a trailing newline) carry no record.
                    continue;
                }

                $data = explode(" ", $row);
                if (count($data) < 2) {
                    $this->info($line . '行: 格式错误');
                    continue;
                }

                // Look up the user rows of this deal record first.
                $dealRecordUserInfo = $dealRecordUserModel->selectDataWithField(['deal_record_no' => $data[0]], ['*']);
                if (empty($dealRecordUserInfo)) {
                    $this->info($line . '行: 单号不存在');
                    continue;
                }

                foreach ($dealRecordUserInfo as $userInfo) {
                    // Match the goods row by user and goods name.
                    $dealRecordGoodsInfo = $dealRecordGoodsModel->findDataWithField(['deal_record_user_id' => $userInfo['id'], 'goods_name' => $data[1]]);
                    if (empty($dealRecordGoodsInfo)) {
                        continue;
                    }
                    var_dump($dealRecordGoodsInfo['order_no']);
                }
            }
        } finally {
            fclose($fp);
        }

        return 0;
    }

    /**
     * Append one trimmed line to the log file; does nothing when no log file is set.
     *
     * @param string $msg
     */
    protected function log($msg) {
        if ($this->logFile === '') {
            // file_put_contents() cannot write to an empty path.
            return;
        }
        file_put_contents($this->logFile, trim($msg) . "\n", FILE_APPEND);
    }

    /**
     * Build an order number from a user name and a creation time.
     *
     * The result is the pinyin abbreviation of the name, followed by the creation time
     * with '-', ' ' and ':' removed, followed by a random 10-digit number.
     *
     * @param string $userName
     * @param string $createdTime
     *
     * @return string
     */
    protected function generateOrderNo($userName, $createdTime) {
        $userNamePinyin = Pinyin::abbr($userName)->join('');
        $cleaned = str_replace(['-', ' ', ':'], '', $createdTime);
        $cleaned .= mt_rand(1000000000, 2000000000);
        return $userNamePinyin . $cleaned;
    }
}
<?php
namespace App\Console\Commands;
use App\Models\DealRecord;
use App\Models\DealRecordGoods;
use App\Models\DealRecordUser;
use Illuminate\Console\Command;
use Illuminate\Support\Facades\DB;
class BatchUpdateDealRecordCourt extends Command
{
    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'batch:update:dealRecord:court';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = '批量更新法院状态';

    /** @var string File that log() appends to; empty means logging is disabled. */
    protected $logFile = '';

    protected $dealRecordModel;
    protected $dealRecordUserModel;
    protected $dealRecordGoodsModel;

    /**
     * Execute the console command.
     *
     * For every `yh_deal_record` row with `deleted_at` null and an empty `contain_court`,
     * collects the distinct `is_court` values of its user rows and stores them in
     * `contain_court`, joined with '%%'.
     *
     * @return int Always 0; individual update failures are printed and skipped.
     */
    public function handle() {
        $this->dealRecordModel = new DealRecord();
        $this->dealRecordUserModel = new DealRecordUser();
        $this->dealRecordGoodsModel = new DealRecordGoods();

        $line = 1;
        $dealRecordWhere = [
            'deleted_at' => null,
            'contain_court' => '',
        ];

        DB::table('yh_deal_record')
            ->where($dealRecordWhere)
            ->chunkById(100, function ($list) use (&$line) {
                foreach ($list->toArray() as $dealRecordInfo) {
                    $dealRecordInfo = (array)$dealRecordInfo;

                    // Keep this record's number before advancing the counter, so a
                    // failure message names the record that actually failed.
                    $currentLine = $line++;
                    var_dump($currentLine);

                    // Collect the court flag of every user row on this deal record.
                    $dealUserInfo = $this->dealRecordUserModel->selectDataWithField(
                        ['deal_record_id' => $dealRecordInfo['id']],
                        ['*']
                    );
                    $allCourt = [];
                    foreach ($dealUserInfo as $dealUser) {
                        $allCourt[] = $dealUser['is_court'];
                    }
                    $allCourt = array_unique($allCourt);

                    $updateData = [
                        'contain_court' => empty($allCourt) ? '' : implode('%%', $allCourt),
                    ];

                    $updateRes = $this->dealRecordModel->updateData(['id' => $dealRecordInfo['id']], $updateData);
                    if (false === $updateRes) {
                        var_dump('第' . $currentLine . '行更新失败');
                    }
                }
            });

        return 0;
    }

    /**
     * Append one trimmed line to the log file; does nothing when no log file is set.
     *
     * @param string $msg
     */
    protected function log($msg) {
        if ($this->logFile === '') {
            // file_put_contents() cannot write to an empty path.
            return;
        }
        file_put_contents($this->logFile, trim($msg) . "\n", FILE_APPEND);
    }
}
<?php
namespace App\Console\Commands;
use App\Models\DealRecord;
use App\Models\DealRecordGoods;
use App\Models\DealRecordUser;
use Illuminate\Console\Command;
use Illuminate\Support\Facades\DB;
use Overtrue\Pinyin\Pinyin;
class BatchUpdateDealRecordOrderNo extends Command
{
    /**
     * The name and signature of the console command.
     *
     * @var string
     */
    protected $signature = 'batch:update:dealRecord:orderNo';

    /**
     * The console command description.
     *
     * @var string
     */
    protected $description = '批量更新交易记录订单号';

    /** @var string File that log() appends to; empty means logging is disabled. */
    protected $logFile = '';

    protected $dealRecordModel;
    protected $dealRecordUserModel;
    protected $dealRecordGoodsModel;

    /**
     * Execute the console command.
     *
     * For every `yh_deal_record` row with `deleted_at` null and an empty
     * `contain_order_no`: generates an order number for each goods row whose `order_no`
     * is an empty string, then stores the distinct order numbers of all goods rows in
     * `contain_order_no`, joined with '%%'.
     *
     * @return int Always 0; individual update failures are printed and skipped.
     */
    public function handle() {
        $this->dealRecordModel = new DealRecord();
        $this->dealRecordUserModel = new DealRecordUser();
        $this->dealRecordGoodsModel = new DealRecordGoods();

        $line = 1;
        $dealRecordWhere = [
            'deleted_at' => null,
            'contain_order_no' => '',
        ];

        DB::table('yh_deal_record')
            ->where($dealRecordWhere)
            ->chunkById(100, function ($list) use (&$line) {
                foreach ($list->toArray() as $dealRecordInfo) {
                    $dealRecordInfo = (array)$dealRecordInfo;

                    // Keep this record's number before advancing the counter, so a
                    // failure message names the record that actually failed.
                    $currentLine = $line++;
                    var_dump($currentLine);

                    // Load the goods rows of this deal record.
                    $dealGoodsInfo = $this->dealRecordGoodsModel->selectDataWithField(
                        ['deal_record_id' => $dealRecordInfo['id']],
                        ['*']
                    );

                    $allOrderNo = [];
                    foreach ($dealGoodsInfo as $dealGoods) {
                        if ($dealGoods['order_no'] !== '') {
                            $allOrderNo[] = $dealGoods['order_no'];
                            continue;
                        }

                        // The goods row has no order number yet: generate and store one.
                        $newOrderNo = $this->generateOrderNo($dealRecordInfo['user_name'], $dealRecordInfo['created_at']);
                        $updateRes = $this->dealRecordGoodsModel->updateData(['id' => $dealGoods['id']], ['order_no' => $newOrderNo]);
                        if (false === $updateRes) {
                            var_dump('第' . $currentLine . '行抵扣信息更新失败');
                            continue;
                        }
                        $allOrderNo[] = $newOrderNo;
                    }
                    $allOrderNo = array_unique($allOrderNo);

                    $updateData = [
                        'contain_order_no' => empty($allOrderNo) ? '' : implode('%%', $allOrderNo),
                    ];

                    $updateRes = $this->dealRecordModel->updateData(['id' => $dealRecordInfo['id']], $updateData);
                    if (false === $updateRes) {
                        var_dump('第' . $currentLine . '行更新失败');
                    }
                }
            });

        return 0;
    }

    /**
     * Append one trimmed line to the log file; does nothing when no log file is set.
     *
     * @param string $msg
     */
    protected function log($msg) {
        if ($this->logFile === '') {
            // file_put_contents() cannot write to an empty path.
            return;
        }
        file_put_contents($this->logFile, trim($msg) . "\n", FILE_APPEND);
    }

    /**
     * Build an order number from a user name and a creation time.
     *
     * The result is the pinyin abbreviation of the name, followed by the creation time
     * with '-', ' ' and ':' removed, followed by a random 10-digit number.
     *
     * @param string $userName
     * @param string $createdTime
     *
     * @return string
     */
    protected function generateOrderNo($userName, $createdTime) {
        $userNamePinyin = Pinyin::abbr($userName)->join('');
        $cleaned = str_replace(['-', ' ', ':'], '', $createdTime);
        $cleaned .= mt_rand(1000000000, 2000000000);
        return $userNamePinyin . $cleaned;
    }
}
<?php
namespace App\Console\Commands;
use Illuminate\Console\Command;
use GuzzleHttp\Client;
use GuzzleHttp\Cookie\CookieJar;
use Illuminate\Support\Facades\Cache;
class test extends Command
{
    // `url`, `--interval` and `--save` are all optional, so the bare command still works.
    protected $signature = 'crawl:dynamic-token
        {url? : 要爬取的URL}
        {--interval=300 : Token刷新间隔(秒)}
        {--save : 保存结果到 storage/app/crawled}';

    protected $description = '使用动态Token管理爬取公考雷达';

    private $client;
    private $baseUrl = 'https://www.gongkaoleida.com';
    private $cookieJar;
    private $sessionData = [];

    /**
     * Execute the console command.
     *
     * Without a `url` argument the command only prints the request-signature self-check.
     * With a `url` it starts a session, requests the URL and reports the result.
     *
     * @return int 0 on success, 1 when the request failed.
     */
    public function handle()
    {
        $url = $this->argument('url');
        if ($url === null || $url === '') {
            $this->printSignatureCheck();
            return 0;
        }

        $this->initializeClient();
        // 1. Start the session.
        $this->startSession();
        // 2. Crawl the target page.
        $result = $this->smartRequest($url);
        if ($result['success']) {
            $this->processResult($result);
            return 0;
        }

        $this->error($result['error']);
        return 1;
    }

    /**
     * Print the md5 signature computed for a captured sample request, then the signed string.
     *
     * Sample request URLs captured from the API (the last one matches the string below):
     * https://api.gongkaoleida.com/api/FrontBackSecure/article/detailUserPc?appid=uqsFgLOVbuPrfn1v&articleId=2716421&from_device=h5&timestamp=1765185164&token=569fff2952c0317c29f3308df15cd75d11d15fmern1cdeol3f9tcfo31cdnb3o9o5yd03&userId=26239811&sign=a11b430c7a5610930bd603b7c6d05df5
     * https://api.gongkaoleida.com/api/FrontBackSecure/article/detailUserPc?appid=uqsFgLOVbuPrfn1v&articleId=2455556&from_device=h5&timestamp=1765185842&token=c1c6a9fdf0bea1fe6a46314e3dd8c17111d156qkkr1cdejzbb1znjle1cdn6hw5gc9gxe&userId=11317419&sign=ebf4a4832ade58f2a45b772d273a838c
     * https://api.gongkaoleida.com/api/FrontBackSecure/article/detailUserPc?appid=uqsFgLOVbuPrfn1v&articleId=2305006&from_device=h5&timestamp=1765251531&token=569fff2952c0317c29f3308df15cd75d11d15fmern1cdeol3f9tcfo31cdnb3o9o5yd03&userId=26239811&sign=31b7c0372a445598bbbe40aabb987acc
     *
     * NOTE(review): the token and secret are hard-coded credentials committed to source
     * control; they should move to configuration.
     */
    private function printSignatureCheck(): void
    {
        $s = 'appid=uqsFgLOVbuPrfn1v&articleId=2305006&from_device=h5&timestamp=1765251531&token=569fff2952c0317c29f3308df15cd75d11d15fmern1cdeol3f9tcfo31cdnb3o9o5yd03&userId=26239811';
        $s .= '&secret=Hf6yn1JPb1QZxniWhIPv1IrHbWeLh2e8';

        $this->line(md5(urlencode($s)));
        $this->line($s);
    }

    /**
     * Create the HTTP client with a fresh cookie jar and default headers.
     */
    private function initializeClient(): void
    {
        $this->cookieJar = new CookieJar();
        $this->client = new Client([
            'base_uri' => $this->baseUrl,
            'cookies' => $this->cookieJar,
            'timeout' => 30,
            'headers' => [
                'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language' => 'zh-CN,zh;q=0.9',
                'Accept-Encoding' => 'gzip, deflate, br',
            ],
        ]);
    }

    /**
     * Start the session: load the home page, extract tokens, then ping the keepalive endpoint.
     */
    private function startSession(): void
    {
        $this->info('启动会话...');
        // Visit the home page to obtain the initial tokens.
        $response = $this->client->get('/');
        $this->extractTokensFromResponse($response);
        // Hit the API endpoint to activate the session.
        $this->activateSession();
        $this->info('会话启动完成');
    }

    /**
     * Extract tokens from the cookie jar and from the response HTML into $sessionData.
     *
     * @param \Psr\Http\Message\ResponseInterface $response
     */
    private function extractTokensFromResponse($response): void
    {
        // Tokens delivered as cookies.
        foreach ($this->cookieJar->toArray() as $cookie) {
            if (!isset($cookie['Name'])) {
                continue;
            }
            $name = $cookie['Name'];
            $value = $cookie['Value'] ?? '';
            if ($name === 'XSRF-TOKEN') {
                $this->sessionData['xsrf_token'] = $value;
                $this->sessionData['xsrf_decoded'] = urldecode($value);
            } elseif ($name === 'gkld_session') {
                $this->sessionData['session'] = $value;
            }
        }

        // CSRF token embedded in the HTML. Casting the body to string reads it from the
        // start, so callers can still read the body afterwards.
        $body = (string) $response->getBody();
        if (preg_match('/<meta name="csrf-token" content="([^"]+)"/', $body, $matches)) {
            $this->sessionData['csrf_token'] = $matches[1];
        }

        $this->logTokenInfo();
    }

    /**
     * Activate the session by sending an AJAX-style keepalive request.
     */
    private function activateSession(): void
    {
        try {
            $response = $this->client->get('/api/session/keepalive', [
                'headers' => [
                    'X-Requested-With' => 'XMLHttpRequest',
                    'X-CSRF-TOKEN' => $this->sessionData['csrf_token'] ?? '',
                    'X-XSRF-TOKEN' => $this->sessionData['xsrf_decoded'] ?? '',
                ],
            ]);
            // Pick up any refreshed tokens.
            $this->extractTokensFromResponse($response);
        } catch (\Exception $e) {
            // Deliberately ignored: not every site exposes this endpoint.
        }
    }

    /**
     * Request a URL, refreshing tokens and retrying with exponential backoff on failure.
     *
     * @param string $url
     * @param int    $maxRetries Maximum number of attempts.
     *
     * @return array On success: ['success' => true, 'status' => int, 'content' => string];
     *               on failure: ['success' => false, 'error' => string].
     */
    private function smartRequest(string $url, int $maxRetries = 3): array
    {
        $retryCount = 0;
        while ($retryCount < $maxRetries) {
            try {
                $this->ensureTokenValid();
                $response = $this->client->get($url, [
                    'headers' => $this->getRequestHeaders($url),
                ]);
                $this->extractTokensFromResponse($response);

                return [
                    'success' => true,
                    'status' => $response->getStatusCode(),
                    // getContents() would return '' here because the body was already read.
                    'content' => (string) $response->getBody(),
                ];
            } catch (\Exception $e) {
                $retryCount++;
                // Refresh the tokens when the failure looks token-related.
                if ($this->isTokenError($e)) {
                    $this->refreshToken();
                }
                if ($retryCount >= $maxRetries) {
                    return [
                        'success' => false,
                        'error' => $e->getMessage(),
                    ];
                }
                sleep(pow(2, $retryCount)); // Exponential backoff.
            }
        }

        return ['success' => false, 'error' => 'Max retries exceeded'];
    }

    /**
     * Refresh the tokens when the last refresh is older than the `--interval` option (seconds).
     */
    private function ensureTokenValid(): void
    {
        $lastUpdate = Cache::get('token_last_update', 0);
        $interval = (int) $this->option('interval');
        if (time() - $lastUpdate >= $interval) {
            $this->refreshToken();
        }
    }

    /**
     * Refresh the tokens by reloading the home page; failures are reported, not thrown.
     */
    private function refreshToken(): void
    {
        $this->info('刷新token...');
        try {
            $response = $this->client->get('/');
            $this->extractTokensFromResponse($response);
            Cache::put('token_last_update', time(), 300);
            $this->info('Token刷新完成');
        } catch (\Exception $e) {
            $this->error('刷新token失败: ' . $e->getMessage());
        }
    }

    /**
     * Build the request headers, including the CSRF/XSRF tokens when known.
     *
     * @param string $url Currently unused.
     *
     * @return array
     */
    private function getRequestHeaders(string $url): array
    {
        $headers = [
            'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
            'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language' => 'zh-CN,zh;q=0.9',
            'Accept-Encoding' => 'gzip, deflate, br',
            'Referer' => $this->baseUrl . '/',
        ];
        if (!empty($this->sessionData['csrf_token'])) {
            $headers['X-CSRF-TOKEN'] = $this->sessionData['csrf_token'];
        }
        if (!empty($this->sessionData['xsrf_decoded'])) {
            $headers['X-XSRF-TOKEN'] = $this->sessionData['xsrf_decoded'];
        }

        return $headers;
    }

    /**
     * Tell whether an exception message mentions '419', 'CSRF' or 'Token'.
     */
    private function isTokenError(\Exception $e): bool
    {
        $message = $e->getMessage();

        return strpos($message, '419') !== false ||
            strpos($message, 'CSRF') !== false ||
            strpos($message, 'Token') !== false;
    }

    /**
     * Print the first 20 characters of each known token.
     */
    private function logTokenInfo(): void
    {
        $info = [];
        if (isset($this->sessionData['csrf_token'])) {
            $info['CSRF Token'] = substr($this->sessionData['csrf_token'], 0, 20) . '...';
        }
        if (isset($this->sessionData['xsrf_token'])) {
            $info['XSRF Token'] = substr($this->sessionData['xsrf_token'], 0, 20) . '...';
        }
        if (isset($this->sessionData['session'])) {
            $info['Session'] = substr($this->sessionData['session'], 0, 20) . '...';
        }
        $this->info('当前Token状态:');
        foreach ($info as $key => $value) {
            $this->line(" {$key}: {$value}");
        }
    }

    /**
     * Report a successful crawl and save the content when `--save` is set.
     *
     * @param array $result Successful result as returned by smartRequest().
     */
    private function processResult(array $result): void
    {
        $this->info('爬取成功!');
        $this->info("状态码: {$result['status']}");
        $this->info("内容长度: " . strlen($result['content']) . " 字节");
        if ($this->option('save')) {
            $this->saveResult($result['content']);
        }
    }

    /**
     * Write the content to storage/app/crawled/<timestamp>.html, creating the directory if needed.
     *
     * @param string $content
     */
    private function saveResult(string $content): void
    {
        $filename = storage_path('app/crawled/' . date('Ymd_His') . '.html');
        $directory = dirname($filename);
        // The second is_dir() covers another process creating the directory concurrently.
        if (!is_dir($directory) && !mkdir($directory, 0755, true) && !is_dir($directory)) {
            $this->error('保存失败: ' . $directory);
            return;
        }
        if (file_put_contents($filename, $content) === false) {
            $this->error('保存失败: ' . $filename);
            return;
        }
        $this->info("结果已保存到: {$filename}");
    }
}
...@@ -5,7 +5,10 @@ namespace App\Http\Services; ...@@ -5,7 +5,10 @@ namespace App\Http\Services;
use App\Http\Controllers\Controller; use App\Http\Controllers\Controller;
use App\Models\Config; use App\Models\Config;
use Ramsey\Uuid\Uuid; use Ramsey\Uuid\Uuid;
use Godruoyi\Snowflake\Snowflake;; use Godruoyi\Snowflake\Snowflake;
use App\Models\LeidaModel;
class BaseService extends Controller class BaseService extends Controller
{ {
...@@ -44,7 +47,8 @@ class BaseService extends Controller ...@@ -44,7 +47,8 @@ class BaseService extends Controller
return $configInfo['config_value'] ?? ''; return $configInfo['config_value'] ?? '';
} }
protected function buildTree($list, $pid = 0) { protected function buildTree($list, $pid = 0)
{
$tree = []; $tree = [];
foreach ($list as $data) { foreach ($list as $data) {
...@@ -57,15 +61,165 @@ class BaseService extends Controller ...@@ -57,15 +61,165 @@ class BaseService extends Controller
return $tree; return $tree;
} }
protected function getUUid() { protected function getUUid()
{
return Uuid::uuid4()->toString(); return Uuid::uuid4()->toString();
} }
protected function getSnowflake() { protected function getSnowflake()
{
$snowflake = new Snowflake(1, 1); $snowflake = new Snowflake(1, 1);
$snowflake->setStartTimeStamp(strtotime('2024-01-01')*1000); $snowflake->setStartTimeStamp(strtotime('2024-01-01') * 1000);
return $snowflake->id(); return $snowflake->id();
} }
/**
 * Parse a notice list page, store the notices dated after 2024-11-01 and return them.
 *
 * Each list item yields its `notice-label` texts (brackets removed) under numeric keys,
 * plus `title`, `url` and `date`. Items without a link are skipped. Items dated on or
 * before 2024-11-01 are skipped and set the `continue` flag in the result.
 *
 * @param string $requestUrl URL the HTML was fetched from; stored as `param`.
 * @param string $html       Page HTML.
 * @param string $areaName   Stored as `province` on every inserted row.
 *
 * @return array ['allData' => array, 'total_pages' => int, 'continue' => bool]
 */
public static function htmlExplain($requestUrl, $html, $areaName)
{
    $emptyResult = ['allData' => [], 'total_pages' => 0, 'continue' => false];

    // DOMDocument::loadHTML() rejects an empty document.
    if (!is_string($html) || trim($html) === '') {
        return $emptyResult;
    }

    // Collect libxml parse errors internally rather than silencing them with "@".
    $previousErrorMode = libxml_use_internal_errors(true);
    $dom = new \DOMDocument();
    $dom->loadHTML($html, LIBXML_NOERROR | LIBXML_NOWARNING);
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    $xpath = new \DOMXPath($dom);

    // Notice items in the main content area.
    $mainContent = $xpath->query("//div[contains(@class, 'mdn-content-l')]//ul[@class='link-list']/li");
    if ($mainContent === false || $mainContent->length === 0) {
        return $emptyResult;
    }

    // Total page count, read from the "last page" link.
    $total_pages = 0;
    $lastPageNode = $xpath->query("//a[contains(text(), '尾页')]");
    if ($lastPageNode->length > 0) {
        $href = $lastPageNode->item(0)->getAttribute('href');
        if (preg_match('/page=(\d+)/', $href, $matches)) {
            $total_pages = (int)$matches[1];
        }
    }

    $cutoff = strtotime('2024-11-01');
    $allData = [];
    $insertData = [];
    $reachedCutoff = false;

    foreach ($mainContent as $node) {
        // Label texts, with the surrounding brackets removed.
        $labels = [];
        foreach ($xpath->query(".//i[@class='notice-label']", $node) as $labelNode) {
            $labels[] = str_replace(['[', ']'], '', trim($labelNode->nodeValue));
        }

        // Title and link; an item without a link has nothing to store.
        $linkNode = $xpath->query(".//a", $node)->item(0);
        if (!$linkNode) {
            continue;
        }
        $labels['title'] = trim($linkNode->nodeValue);
        $labels['url'] = $linkNode->getAttribute('href');

        // Publication date.
        $timeNode = $xpath->query(".//time", $node)->item(0);
        $labels['date'] = $timeNode ? trim($timeNode->nodeValue) : '';

        // Skip notices dated on or before the cutoff and remember that one was seen.
        $timestamp = strtotime($labels['date']);
        if ($timestamp && $timestamp <= $cutoff) {
            $reachedCutoff = true;
            continue;
        }

        $allData[] = $labels;
        $insertData[] = [
            'param' => $requestUrl,
            'province' => $areaName,
            // An item may carry fewer than two labels.
            'area' => $labels[0] ?? '',
            'type' => $labels[1] ?? '',
            'title' => $labels['title'],
            'url' => $labels['url'],
            'stime' => $labels['date'],
            'content' => json_encode($labels),
        ];
    }

    if (!empty($insertData)) {
        $LeidaModel = new LeidaModel();
        $LeidaModel->insertMultiData($insertData);
    }

    return ['allData' => $allData, 'total_pages' => $total_pages, 'continue' => $reachedCutoff];
}
} }
<?php
/**
* Created by PhpStorm.
* User: bmb369
* Date: 2024-05-07
* Time: 14:20
*/
namespace App\Models;
use Illuminate\Database\Eloquent\SoftDeletes;
class LeidaModel extends Base {
    use SoftDeletes;

    protected $table = 'gz_leida';
    protected $primaryKey = 'id';
    protected $dates = ['deleted_at'];

    /**
     * Fetch one row of this model's table by id.
     *
     * @param mixed $id Value matched against the table's `id` column.
     *
     * @return array The row as an array, or an empty array when no row matches.
     */
    public function getInfo($id)
    {
        $columns = "{$this->table}.*";
        $conditions = ["{$this->table}.id" => $id];

        $record = $this->select($columns)->where($conditions)->first();

        return empty($record) ? [] : $record->toArray();
    }
}
\ No newline at end of file
未爬取到数据:https://www.gongkaoleida.com/area/3510-0-0-3-124?page=1
未爬取到数据:https://www.gongkaoleida.com/area/3510-0-0-3-124?page=1
未爬取到数据:https://www.gongkaoleida.com/area/3510-0-0-3-124?page=1
未爬取到数据:https://www.gongkaoleida.com/area/3510-0-0-59-124?page=1
未爬取到数据:https://www.gongkaoleida.com/area/3510-0-0-3-124?page=1
未爬取到数据:https://www.gongkaoleida.com/area/3510-0-0-3-124?page=1
未爬取到数据:https://www.gongkaoleida.com/area/3510-0-0-3-124?page=1
未爬取到数据:https://www.gongkaoleida.com/area/3510-0-0-3-124?page=1
未爬取到数据:
未爬取到数据:
未爬取到数据:https://www.gongkaoleida.com/area/3510-0-0-3-124?page=46
未爬取到数据:https://www.gongkaoleida.com/area/3510-0-0-59-124?page=1
未爬取到数据:https://www.gongkaoleida.com/area/3510-0-0-60-124?page=1
未爬取到数据:https://www.gongkaoleida.com/area/3510-0-0-7-124?page=1
未爬取到数据:https://www.gongkaoleida.com/area/3510-0-0-63-124?page=1
未爬取到数据:https://www.gongkaoleida.com/area/3510-0-0-62-124?page=1
未爬取到数据:https://www.gongkaoleida.com/area/3510-0-0-8-124?page=1
未爬取到数据:https://www.gongkaoleida.com/area/3510-0-0-9-124?page=1
未爬取到数据:https://www.gongkaoleida.com/area/3510-0-0-66-124?page=1
未爬取到数据:https://www.gongkaoleida.com/area/3510-0-0-67-124?page=1
未爬取到数据:https://www.gongkaoleida.com/area/3510-0-0-78-124?page=1
This source diff could not be displayed because it is too large. You can view the blob instead.
<?php <?php
$html = file_get_contents( './list.html'); //$html = file_get_contents( './list.html');
$token = "eyJpdiI6IlwvZDFiWk9rR3pDcU5hOWdHMWxwbXZ3PT0iLCJ2YWx1ZSI6IjRwYkUralJEbHBhZEg2S2tnOUV5citXVFd4TWY0TkxpVXllSE1BS1JMR0sxd3lcL0dYeVRKSkxvcitJb1ZFZFk5IiwibWFjIjoiMDEyODIyMTFhZDU3YjI1ODM4MmFhMmYxZGE4YjVhOGRjZjEzZmNlMzViMmExMDI3YjljMWNmOTYxMzhlMTM1YyJ9";
// Base64 解码
$decoded = base64_decode($token);
var_dump($decoded);die;
$dom = new DOMDocument(); $dom = new DOMDocument();
@$dom->loadHTML($html, LIBXML_NOERROR | LIBXML_NOWARNING); @$dom->loadHTML($html, LIBXML_NOERROR | LIBXML_NOWARNING);
$xpath = new DOMXPath($dom); $xpath = new DOMXPath($dom);
...@@ -17,6 +26,19 @@ $noticeNodes = $xpath->query("//div[@class='mdn-content-l']//ul[@class='link-lis ...@@ -17,6 +26,19 @@ $noticeNodes = $xpath->query("//div[@class='mdn-content-l']//ul[@class='link-lis
// 最佳方法:查找主内容区的公告列表 // 最佳方法:查找主内容区的公告列表
$mainContent = $xpath->query("//div[contains(@class, 'mdn-content-l')]//ul[@class='link-list']/li"); $mainContent = $xpath->query("//div[contains(@class, 'mdn-content-l')]//ul[@class='link-list']/li");
echo "主内容区公告数量: " . $mainContent->length . "\n"; echo "主内容区公告数量: " . $mainContent->length . "\n";
// 提取总页数(从尾页链接)
$lastPageNode = $xpath->query("//a[contains(text(), '尾页')]");
if ($lastPageNode->length > 0) {
$href = $lastPageNode->item(0)->getAttribute('href');
if (preg_match('/page=(\d+)/', $href, $matches)) {
$total_pages = (int)$matches[1];
}
}
var_dump($total_pages);
$allData = []; $allData = [];
if ($mainContent->length > 0) { if ($mainContent->length > 0) {
foreach ($mainContent as $index => $node) { foreach ($mainContent as $index => $node) {
...@@ -44,6 +66,16 @@ if ($mainContent->length > 0) { ...@@ -44,6 +66,16 @@ if ($mainContent->length > 0) {
$date = $timeNode ? trim($timeNode->nodeValue) : ''; $date = $timeNode ? trim($timeNode->nodeValue) : '';
$labels['date'] = $date; $labels['date'] = $date;
//时间超过2024年
var_dump($labels['date']);
var_dump(strtotime($labels['date']));
var_dump(strtotime('2024-11-01'));
if(strtotime($labels['date']) && strtotime($labels['date']) <= strtotime('2024-11-01')){
$dateFlag = 1;
var_dump(333);
}
// // 格式化输出 // // 格式化输出
// if (count($labels) >= 2) { // if (count($labels) >= 2) {
// // 通常第一个是地区,第二个是考试类型 // // 通常第一个是地区,第二个是考试类型
...@@ -60,7 +92,7 @@ if ($mainContent->length > 0) { ...@@ -60,7 +92,7 @@ if ($mainContent->length > 0) {
} }
} else { } else {
echo "未找到公告列表!尝试备用方法...\n"; dd( "未找到公告列表!尝试备用方法...");
// 备用方法:查找所有可能包含公告的li // 备用方法:查找所有可能包含公告的li
$allLi = $xpath->query("//li"); $allLi = $xpath->query("//li");
...@@ -94,7 +126,7 @@ if ($mainContent->length > 0) { ...@@ -94,7 +126,7 @@ if ($mainContent->length > 0) {
echo "通过备用方法找到公告数量: $noticeCount\n"; echo "通过备用方法找到公告数量: $noticeCount\n";
} }
var_dump($allData);die; //var_dump($allData);die;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment