记一次爬取豆瓣电影详情(PHP)

sixener · 发表于 2019-2-13 13:17

帮朋友爬取豆瓣电影的介绍里面的内容,给他出个接口让他爬取，废话不多说了，上代码

简单的爬取分为两个文件

fetch.php

<?php
/**
 * Looks up a movie on Douban by its exact title and prints its details as JSON.
 *
 * Flow:
 *   1. Query the subject_suggest endpoint with the title and keep the entries
 *      whose title equals the requested one.
 *   2. Download the detail page of the first match.
 *   3. Pull the individual fields out of the page with XPath queries.
 *
 * Output (always via echo):
 *   - {"code":10001,"msg":"暂无片源信息"} when no suggestion matches the title;
 *   - an HTML-formatted cURL error when the detail page cannot be downloaded;
 *   - {"code":0,"msg":"抓取成功","data":{...}} on success.
 */
require "./getfunction.php";

$name = "复仇者联盟3：无限战争";
// The title contains non-ASCII characters and punctuation, so it has to be
// percent-encoded before it is placed in the query string.
$url = "https://movie.douban.com/j/subject_suggest?q=".urlencode($name);

$curl = curl_init(); // start a cURL session; the same handle serves both requests
curl_setopt($curl, CURLOPT_URL, $url);
curl_setopt($curl, CURLOPT_HEADER, 0);
curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
// NOTE(review): peer and host verification are switched off, so the TLS
// connection is not authenticated. Kept as in the original; enable both
// once a CA bundle is available on the host.
curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, false);
$tmpInfo = curl_exec($curl); // JSON text of the suggestion list, or false on failure

// A failed request or an unexpected payload is treated as "no suggestions"
// instead of being fed into foreach.
$suggestions = is_string($tmpInfo) ? json_decode($tmpInfo) : null;
if (!is_array($suggestions)) {
    $suggestions = [];
}

// Keep only the suggestions whose title equals the requested title.
$arrat_res = [];
foreach ($suggestions as $v) {
    if (is_object($v) && isset($v->title) && $name == $v->title) {
        $arrat_res[] = $v;
    }
}
if (empty($arrat_res)) {
    $data = [
        "code"=>10001,
        "msg"=>"暂无片源信息"
    ];
    echo json_encode($data);
    exit;
}

// Fetch the detail page of the first match. The options set above stay on the
// handle, so only the URL has to change.
$url2 = $arrat_res[0]->url;
curl_setopt($curl, CURLOPT_URL, $url2);
$tmpInfo2 = curl_exec($curl); // HTML of the detail page

if (!$tmpInfo2) {
    echo "<br />cURL error number:" .curl_errno($curl);
    echo "<br />cURL error:" . curl_error($curl);
    exit;
}

// Parse the HTML. Parser warnings about malformed markup are collected
// internally and discarded rather than silenced with the @ operator.
$dom = new DOMDocument();
$previousLibxmlSetting = libxml_use_internal_errors(true);
$dom->loadHTML($tmpInfo2);
libxml_clear_errors();
libxml_use_internal_errors($previousLibxmlSetting);
$dom->normalize();

// XPath wrapper used for every query below.
$xpath = new DOMXPath($dom);

// Returns the nodeValue of the first node matched by $expression,
// or $default when nothing matches.
$firstValue = function ($expression, $default = "") use ($xpath) {
    $nodes = $xpath->evaluate($expression);
    if (empty($nodes->length)) {
        return $default;
    }
    return $nodes->item(0)->nodeValue;
};

// Joins the nodeValue of every node in $nodes with commas.
$joinValues = function ($nodes) {
    $values = [];
    for ($i = 0; $i < $nodes->length; $i++) {
        $values[] = $nodes->item($i)->nodeValue;
    }
    return implode(",", $values);
};

// Directors
$directors_res = $joinValues($xpath->evaluate("//*[@id='info']/span[1]/span[2]/a/text()"));

// Title; empty string when the heading is missing, so the concatenation in
// the result below cannot hit a DOMNodeList.
$name = $firstValue("//*[@id='content']/h1/span[1]/text()");

// Year
$years = $firstValue("//*[@id='content']/h1/span[2]/text()");

// Poster
$img = $firstValue("//*[@id='mainpic']/a/img/@src");

// Release state: 1 = not released yet, 2 = released. Only the explicit
// "尚未上映" marker selects 1; a missing marker counts as released.
$is_on = 2;
$releaseMarker = $firstValue("//*[@id='interest_sectl']/div/div[2]/div/div[2]", null);
if ($releaseMarker !== null && trim($releaseMarker) == "尚未上映") {
    $is_on = 1;
}

// Screenwriters
$screenwriters_res = $joinValues($xpath->evaluate("//*[@id='info']/span[2]/span[2]/a/text()"));

// Actors
$actors_res = $joinValues($xpath->query("//*[@id='info']/span[3]/span[2]"));

// Genres: spans from position 5 up to the "制片国家/地区:" label.
$getfunction = new getFunction();
$sear_res = $getfunction->getRes(5,"制片国家/地区:",$xpath);
$types_res = $sear_res["res"];
// getRes reports "" when it finds no label; cast so the offsets below stay
// integer arithmetic.
$num = (int) $sear_res["num"];

// Bare text nodes of #info that contain at least one CJK character; these
// carry country, language, (sometimes runtime) and alternative titles.
$attr = [];
$langs = $xpath->evaluate("//*[@id='info']/text()");
for ($i = 0; $i < $langs->length; $i++) {
    $lang = $langs->item($i)->nodeValue;
    if (preg_match('/[\x{4e00}-\x{9fa5}]/u', $lang)>0) {
        $attr [] = $lang;
    }
}

if ($is_on == 1) {
    // Not released yet: no runtime, release dates run up to the "又名:" label.
    $show_res = "";
    $sear2_res = $getfunction->getRes($num+4,"又名:",$xpath);
    $time_res = $sear2_res["res"];
    $num = (int) $sear2_res["num"];
}else{
    // Release dates: spans up to the "片长:" label.
    $sear2_res = $getfunction->getRes($num+4,"片长:",$xpath);
    $time_res = $sear2_res["res"];
    $num = (int) $sear2_res["num"];

    // Runtime: spans after "片长:" up to the "又名:" label.
    $sear3_res = $getfunction->getRes($num+1,"又名:",$xpath);
    $show_res = $sear3_res["res"];
    $num = (int) $sear3_res["num"];
}

// With four text fragments the third one is extra runtime text and the
// alternative titles move to the fourth slot; missing slots become "".
$country = isset($attr[0]) ? $attr[0] : "";
$languages = isset($attr[1]) ? $attr[1] : "";
if (count($attr) == 4) {
    $show_res = $show_res.$attr[2];
    $byname = $attr[3];
}else{
    $byname = isset($attr[2]) ? $attr[2] : "";
}

// Links inside #info: when two links exist the first is the official site and
// the second the IMDb link; with a single link it is taken as the IMDb link.
$firstLink = $firstValue("//*[@id='info']/a[1]/@href", null);
$secondLink = $firstValue("//*[@id='info']/a[2]/@href", null);
if ($secondLink !== null) {
    $urlim = $secondLink;
    $urls = $firstLink !== null ? $firstLink : "";
}else{
    $urls = "";
    $urlim = $firstLink !== null ? $firstLink : "";
}

$final_res = [
    "all_name" => $name.$years,
    "name" => $name,
    "year" => $years,
    "img" => $img,
    "directors" => $directors_res,
    "screenwriters" => $screenwriters_res,
    "actors" => $actors_res,
    "types" => $types_res,
    "web_url" => $urls,
    "country" => $country,
    "languages" => $languages,
    "ontime" => $time_res,
    "showtime" => $show_res,
    "byname" => $byname,
    "imbd" => $urlim
];

$return = ["code"=>0, "msg"=>"抓取成功", "data"=>$final_res ];
echo json_encode($return);

getfunction.php

<?php
/**
 * Helper for reading runs of sibling <span> elements inside the #info node.
 */
class getFunction{
  /**
   * Walks the spans "//*[@id='info']/span[N]" for N from $start up to 29.
   *
   * Span texts are concatenated with commas until a span whose text equals
   * $key or "官方网站:" is met. The scan keeps going after that point, so
   * "num" ends up as the position of the last such label that was seen.
   *
   * @param int      $start position of the first span to inspect
   * @param string   $key   label text that ends the collected run
   * @param DOMXPath $xpath XPath object bound to the parsed page
   * @return array   ["res" => joined span texts, "num" => label position, or "" when no label was found]
   */
  public static function getRes($start,$key,$xpath){
    $collected = "";
    $labelPosition = "";

    for ($position = $start; $position < 30; $position++) {
      $nodes = $xpath->query("//*[@id='info']/span[".$position."]");
      if (empty($nodes->length)) {
        // No span at this position.
        continue;
      }

      $text = $nodes->item(0)->nodeValue;

      // A label span: remember where it sits and move on.
      if ($text == $key || $text == "官方网站:") {
        $labelPosition = $position;
        continue;
      }

      // Anything after the first label is not part of the collected run.
      if (!empty($labelPosition)) {
        continue;
      }

      $collected = ($position != $start) ? $collected.",".$text : $text;
    }

    return ["res"=>$collected,"num"=>$labelPosition];
  }
}

效果图

sixener · 发表于 2019-2-14 11:21

iamasd 发表于 2019-2-14 10:26
能搞到豆瓣电影的评论么？

也可以用 xpath，但是只能取到一部分，后面的就都是一样的了；不过你可以定时去爬取，效果还是不错的，建议挂着代理去爬。我一开始是用 python 写的，但是需求是得用 php，只能给我朋友写个 PHP 版的了。求给个热心，哈哈

sixener · 发表于 2019-3-19 21:09

孟坤软件发表于 2019-3-19 18:23
楼主把问题弄复杂了，豆瓣直接提供了接口的……
[mw_shl_code=html,true]https://api.douban.com/v2/movie ...

我的那个时候可能没有前后分离的

sixener · 发表于 2019-2-13 14:49

打破自己

iamasd · 发表于 2019-2-13 16:49

兄弟，用php撸爬虫也是优秀哦

sixener · 发表于 2019-2-13 21:36

iamasd 发表于 2019-2-13 16:49
兄弟，用php撸爬虫也是优秀哦

xpath还是很简单的

iamasd · 发表于 2019-2-14 10:26

能搞到豆瓣电影的评论么？

iamasd · 发表于 2019-2-15 10:06

是的，我之前还研究php怎么和python搞一搞

sixener · 发表于 2019-2-15 10:37

iamasd 发表于 2019-2-15 10:06
是的，我之前还研究php怎么和python搞一搞

python的定时是极好的

maiawpyg · 发表于 2019-2-16 12:23

楼主能私聊不帮点忙

sixener · 发表于 2019-2-16 13:12

maiawpyg 发表于 2019-2-16 12:23
楼主能私聊不帮点忙

怎么了？

帐号		自动登录	找回密码
密码			注册[Register]

[其他原创] 记一次爬取豆瓣电影详情(PHP)

效果图

免费评分

个人中心