• 2011-08-31

    php版的舆情监控系统 - [技术空间]

    版权声明:转载时请以超链接形式标明文章原始出处和作者信息及本声明
    http://www.blogbus.com/lily64-logs/158842076.html

    php版的舆情监控系统。可以设置关键词和目标网址,进行标题和全文搜索。设置3个关键词,4个目标网址,全文搜索所需时间大约8分钟。代码存档。

    common.php
    <?php
    // Declare the response as GBK-encoded HTML; the database connection in this file is set to gbk as well.
    header("content-type:text/html; charset=gbk");
    // 0 disables PHP's execution time limit for scripts that include this file.
    set_time_limit(0);
    /**
     * Heuristic check for UTF-8 encoded CJK text (not a general UTF-8 validator).
     *
     * A "sequence" here is three bytes: a lead byte 0xE4-0xE9 followed by two
     * continuation bytes 0x80-0xBF. The function reports true when $word starts
     * with such a sequence, ends with one, or contains two or more in a row.
     *
     * @param string $word raw bytes to inspect
     * @return bool true when one of the three patterns matches, false otherwise
     */
    function is_utf8($word)
    {
        $lead = "[".chr(228)."-".chr(233)."]{1}";
        $tail = "[".chr(128)."-".chr(191)."]{1}";
        $seq = $lead.$tail.$tail;

        $patterns = array(
            "/^(".$seq."){1}/",   // sequence at the very start
            "/(".$seq."){1}$/",   // sequence at the very end
            "/(".$seq."){2,}/",   // at least two consecutive sequences anywhere
        );

        foreach ($patterns as $pattern)
        {
            if (preg_match($pattern, $word) == true)
            {
                return true;
            }
        }

        return false;
    } // function is_utf8

    /**
     * Returns the byte offset of the $n-th occurrence of $s1 inside $cs.
     *
     * Occurrences are searched left to right, each search starting one byte
     * after the previous hit (so overlapping occurrences are counted).
     *
     * @param string $cs haystack
     * @param string $s1 needle
     * @param int    $n  1-based occurrence number
     * @return int zero-based offset of the $n-th occurrence, or -1 when there
     *             are fewer than $n occurrences, the needle is empty, or $n < 1
     */
    function check($cs, $s1, $n){
        if($s1 === '' || $n < 1){
            return -1;
        }

        $offset = 0;
        $pos = -1;
        for($hit = 0; $hit < $n; $hit++){
            $pos = strpos($cs, $s1, $offset);
            // Strict comparison: offset 0 is a valid hit, only false means "not found".
            // Stop at the first miss instead of rescanning the string again and again.
            if($pos === false){
                return -1;
            }
            $offset = $pos + 1;
        }
        return $pos;
    }
    // Shared database handle used by every script that includes this file.
    // NOTE(review): credentials are hard-coded here; consider moving them to configuration.
    $conn=mysql_connect("localhost","root","123456");
    $db = mysql_select_db("db_keywords", $conn);
    if($conn)
    {
        // mysql_set_charset() is the documented way to select the connection charset: unlike a
        // plain "SET NAMES" query it also informs the client library, which
        // mysql_real_escape_string() depends on. Fall back to SET NAMES only if it is
        // unavailable or fails, and always address the explicit link.
        if(!function_exists("mysql_set_charset") || !mysql_set_charset("gbk", $conn))
            mysql_query("set names gbk;", $conn);
    }
    ?>

    getkeylink.php
    <?php
    require_once("common.php");
    ?>
    <title>
    关键词搜刮系统
    </title>
    <style>
    li{width:80px;float:left;list-style:none;overflow:hidden;border-left:1px solid green;border-top:1px solid green;height:25px;line-height:25px;vertical-align:middle;padding-left:5px}
    ul{height:25px;width:1200px;padding:0;margin:0;clear:both}
    .li1{background:#F5DBDA;}
    img{border:0; vertical-align:middle;height:25px}
    </style>
    <?php
    // Collect the search configuration from the submitted form:
    //   $arr     - target site URLs
    //   $namearr - display name per site (same index as $arr)
    //   $arr2    - keywords
    // Built-in defaults are used when the corresponding field was not posted.
    $arr=array();
    $arr2=array();
    $namearr=array();

    if(isset($_POST["getlink"]))
    {
        $getlink=$_POST["getlink"];
        // The form on this page posts no linkname[] fields, so this is normally absent;
        // treat a missing or malformed value as "no custom names".
        $linkname=(isset($_POST["linkname"])&&is_array($_POST["linkname"]))?$_POST["linkname"]:array();
        if(is_array($getlink))
        {
            foreach($getlink as $key=>$value)
            {
                if(!is_string($value)||$value=="")
                    continue;

                array_push($arr,$value);
                // Fall back to the URL itself when no name was supplied for this site.
                if(!isset($linkname[$key])||$linkname[$key]=="")
                    $namevalue=$value;
                else
                    $namevalue=$linkname[$key];
                array_push($namearr,$namevalue);
            }
        }
    }
    else
    {
        $arr[0]="http://news.sina.com.cn";
        $arr[1]="http://news.163.com";
        $arr[2]="http://focus.tianya.cn";
        $arr[3]="http://news.ifeng.com";
        // Keep $namearr index-aligned with $arr so later lookups by site index are defined.
        $namearr=$arr;
    }

    if(isset($_POST["getkey"]))
    {
        $getkey=$_POST["getkey"];
        if(is_array($getkey))
        {
            foreach($getkey as $key=>$value)
            {
                if(is_string($value)&&$value!="")
                    array_push($arr2,$value);
            }
        }
    }
    else
    {
        $arr2[0]="政府";
        $arr2[1]="公民";
        $arr2[2]="人权";
    }
    ?>
    <script>
    // Global pause flag: the "break"/"continue" buttons of the form set it, and the
    // getkeycontent.php iframe reads it as top.breakit before scheduling its next request.
    breakit=false;
    // Posts the form back to this page (empty action and target) after recording the
    // requested search type in the hidden "searchtype" field.
    function mysubmit(submittype)
    {
        var form=document.getElementById("myform");
        document.getElementsByName("searchtype")[0].value=submittype;
        form.target="";
        form.action="";
        form.submit();
    }
    // Stores the save type in the hidden "savetype" field and, once the user confirms,
    // posts the form to savekey.php inside the hidden iframe "myifrm2".
    // Does nothing when no type is given.
    function mysavekey(submittype)
    {
        if(submittype==null)
        {
            return;
        }
        document.getElementsByName("savetype")[0].value=submittype;
        var question="all previous "+submittype+"s will be deleted first! are you sure?";
        if(confirm(question))
        {
            var form=document.getElementById("myform");
            form.action="savekey.php";
            form.target="myifrm2";
            form.submit();
        }
    }
    </script>
    <form method="post" id=myform style="padding-bottom:0;margin-bottom:0">
    <input type="hidden" name=searchtype value="" />
    <input type="hidden" name=savetype value="" />
    <input type=hidden id=currentlink />
    <?php
    // Re-render the submitted keywords and sites as editable inputs, plus one empty input each.
    // The values come straight from the request, so they are HTML-escaped before being placed
    // inside the single-quoted value attributes. The escaping is done byte-wise on purpose:
    // all five characters are below 0x40 and therefore never occur as the second byte of a
    // GBK character, while htmlspecialchars() has no GBK charset and may return an empty string.
    $htmlfrom=array("&","<",">","\"","'");
    $htmlto=array("&amp;","&lt;","&gt;","&quot;","&#039;");

    echo "<div id=keydiv>keys:";
    foreach($arr2 as $k=>$v)
        echo "<input name=getkey[] value='".str_replace($htmlfrom,$htmlto,$v)."' />&nbsp;";

    echo "<input name=getkey[]>&nbsp;";
    echo "</div>";

    echo "<div id=sitediv>";
    foreach($arr as $k=>$v)
        echo "site:<input size=60 name=getlink[] value='".str_replace($htmlfrom,$htmlto,$v)."' />&nbsp;<br>";

    echo "site:<input size=60 name=getlink[]>&nbsp;<br>";
    echo "</div>";

    // Action buttons: title search, full-text search, pause/resume of the worker iframe,
    // adding extra keyword/site inputs, and the auto-scroll toggle.
    echo "<input type=submit onclick=mysubmit('title') value=get_title_info><input type=button onclick=mysubmit('txt') value=get_txt_info><input type=button onclick='breakit=true' value=break><input type=button onclick='breakit=false;window.frames[0].location.href=document.getElementById(\"currentlink\").value' value=continue><input type=button value=addkey onclick=\"document.getElementById('keydiv').innerHTML+='<input name=getkey[]>&nbsp;';\"><input type=button value=addsite onclick=\"document.getElementById('sitediv').innerHTML+='site:<input size=60 name=getlink[]>&nbsp;<br>';\"><input type=checkbox id=mychk checked=checked><label for=mychk>auto scroll</label><br>";
    ?>
    </form><hr>
    <iframe id=myifrm2 name=myifrm2 style="display:none;"></iframe>
    <?php
    // Without a posted search type there is nothing to search: only the form is shown.
    if(!isset($_POST["searchtype"]))
    {
        mysql_close($conn);
        exit;
    }

    // Parallel result lists for title hits (same index = same hit):
    // link, highlighted title, matched keyword, hit location label, site name.
    $linkarr=$titlearr=$keyarr=$inarr=$sitearr=array();

    // Parallel lists of links collected for the full-text pass: link, title, site name.
    $tmplinkarr=$tmptitlearr=$tmpsitearr=array();

    // Title search: download every target page, extract each <a ... href=...>text</a> pair and
    // test the anchor text against the keywords. Hits go to $linkarr/$titlearr/$keyarr/$inarr/
    // $sitearr. In "txt" mode, links whose title did not match are collected in
    // $tmplinkarr/$tmptitlearr/$tmpsitearr for the full-text pass.
    $fulltext=isset($_POST["searchtype"])&&$_POST["searchtype"]=="txt";

    foreach($arr as $key=>$value)
    {
        // NOTE(review): $value is user-supplied and handed to file_get_contents() unchecked,
        // so it can name any URL wrapper or local path readable by PHP - consider restricting
        // it to http(s) URLs.
        $content=@file_get_contents($value);
        if($content===false)
            continue; // page could not be fetched: nothing to scan

        // Display name of this site; fall back to the URL when none was recorded.
        $sitename=isset($namearr[$key])?$namearr[$key]:$value;

        // Site root (everything before the third "/", i.e. scheme + host) and scheme, computed
        // once per site and used to complete relative links.
        $rootend=check($value,"/",3);
        $siteroot=($rootend==-1)?$value:substr($value,0,$rootend);
        $scheme=(strtolower(substr($value,0,6))=="https:")?"https:":"http:";

        preg_match_all("/<a(.*?)href=(.*?)>(.*?)<\/a>/is",$content,$tmparr);

        foreach($tmparr[3] as $k=>$v)
        {
            if(is_utf8($v))
                $v = iconv("UTF-8","GB2312//IGNORE",$v);
            $v=trim($v);
            $stripv=strip_tags($v);

            // href value: drop quotes and keep only the part before the first space
            // (explode() replaces the regex-based split(), which was removed in PHP 7).
            $link=str_replace("'","",$tmparr[2][$k]);
            $link=str_replace("\"","",$link);
            $link=explode(" ",$link);
            $link=trim($link[0]);
            if(is_utf8($link))
                $link = iconv("UTF-8","GB2312//IGNORE",$link);

            // Skip non-navigational links and links back to the site itself.
            if(strtolower(substr($link,0,6))=="mailto"||strtolower(substr($link,0,10))=="javascript"||substr($link,0,1)=="{"||substr($link,0,1)=="#"||$link=="/"||$link==""||$link==$value||$link==$value."/")
                continue;

            if(substr($link,0,4)!="http")
            {
                if(substr($link,0,2)=="//")
                    $link=$scheme.$link; // protocol-relative link: only the scheme is missing
                else
                    // Relative links are resolved against the site root; exactly one "/" joins
                    // root and path whether or not the link already starts with one.
                    $link=$siteroot."/".ltrim($link,"/");
            }

            $found=false;
            foreach($arr2 as $k2=>$v2)
            {
                $p=mb_strpos($stripv,$v2,0,"GB2312");

                if($p!==false)
                {
                    array_push($linkarr,$link);
                    // Title with the matched keyword wrapped in a coloured <font> tag.
                    array_push($titlearr,mb_substr($stripv,0,$p,"GB2312")."<font color=fuchsia>".$v2."</font>".mb_substr($stripv,$p+mb_strlen($v2,"GB2312"),mb_strlen($stripv,"GB2312"),"GB2312"));
                    array_push($keyarr,$v2);
                    array_push($inarr,"<font color='red'>标题</font>");
                    array_push($sitearr,$sitename);
                    $found=true;
                    break; // first matching keyword wins
                }
            }

            // Full-text candidates: no title hit, title longer than 2 characters (or empty),
            // more than three "/" in the URL, and the URL does not end with "/".
            if($fulltext&&!$found&&(mb_strlen($stripv,"GB2312")>2||$stripv=="")&&substr_count($link,"/")>3&&substr($link,-1)!="/")
            {
                if($stripv=="")$stripv="无文字标题";
                array_push($tmplinkarr,$link);//link
                array_push($tmptitlearr,$stripv);//title
                array_push($tmpsitearr,$sitename);//site
            }
        }
    }

    // Full-text mode: store the candidate links and the keywords in the txttmp/keytmp tables,
    // all tagged with the same $matchtime so getkeycontent.php can pick up exactly this batch.
    if(isset($_POST["searchtype"])&&$_POST["searchtype"]=="txt")
    {
        // array_unique() keeps the original keys, so $i still indexes the title/site lists.
        $tmplinkarr=array_unique($tmplinkarr);

        $matchtime=time();

        foreach($tmplinkarr as $i=>$link)
        {
            // Links, titles and site names originate from crawled pages and form input:
            // escape them before embedding them in the statement.
            $sql="insert into txttmp(link,title,site,matchtime) values('".mysql_real_escape_string($link,$conn)."','".mysql_real_escape_string($tmptitlearr[$i],$conn)."','".mysql_real_escape_string($tmpsitearr[$i],$conn)."',".$matchtime.")";
            $ret = mysql_query($sql,$conn);
        }

        foreach($arr2 as $k2=>$v2)
        {
            $sql="insert into keytmp(keyword,matchtime) values('".mysql_real_escape_string($v2,$conn)."',".$matchtime.")";
            $ret = mysql_query($sql,$conn);
        }

        mysql_close($conn);
    }
    ?>
    <div id=result style="height:600px;overflow-y:scroll">
    <ul style="font-weight:bold;text-align:center;color:white"><li style='width:50px;background:orange'>序号</li><li style='width:400px;background:orange'>标题</li><li style="background:orange">关键词</li><li style="background:orange">位置</li><li style="width:350px;background:orange">网址</li><li style='border-right:1px solid green;background:orange;width:160px'>跟踪时间</li></ul>
    <?php
    // Print one table-like <ul> row per title hit. $k keeps the index of the last row
    // (-1 when there are none) because it is read again after this loop.
    $k=-1;
    foreach($linkarr as $k=>$v)
    {
        // Even indexes are unstyled, odd indexes get the "li1" background class.
        $licss=($k%2==0)?"":"class='li1'";

        $row ="<ul>";
        $row.="<li $licss style='width:50px'>".($k+1)."</li>";
        $row.="<li $licss style='width:400px'><a href='".$v."' target=_blank>".$titlearr[$k]."</a></li>";
        $row.="<li $licss>".$keyarr[$k]."</li>";
        $row.="<li $licss>".$inarr[$k]."</li>";
        $row.="<li style='width:350px;' $licss><a href='".$sitearr[$k]."' target=_blank>".$sitearr[$k]."</a></li>";
        $row.="<li $licss style='border-right:1px solid green;width:160px'>".date("Y-m-d H:i:s")."</li>";
        $row.="</ul>";
        echo $row;
    }?>
    </div>
    <?php
    // Full-text mode only: embed the iframe that runs getkeycontent.php.
    // Query parameters: i = number for the next result row (last title row index + 2),
    // t = $matchtime of the batch stored above, s = number of queued links,
    // r = round counter, starting at 1.
    if(isset($_POST["searchtype"])&&$_POST["searchtype"]=="txt")
    {
    ?>
    <br />
    <iframe id=myifrm src="getkeycontent.php?i=<?php echo $k+2;?>&t=<?php echo $matchtime;?>&s=<?php echo count($tmplinkarr);?>&r=1" style="width:400px;height:30px;overflow:hidden" frameborder="0" scrolling="no"></iframe>
    <?php
    }
    ?>

    getkeycontent.php
    <?php
    require_once("common.php");
    ?>
    <body style="padding:0;margin:0;">
    <?php
    // Deadline for this request: processing stops once 30 seconds have passed.
    $breaktime = time()+30;//must not be less than 20 (NOTE(review): presumably tied to the 20s curl timeout - confirm)
    $step=30;//number of links handled per request

    // Request parameters: i = next result row number, s = total queued links,
    // r = current round, t = batch timestamp. All are integers produced by the scripts
    // themselves; they are later interpolated into SQL and JavaScript, so force them to int.
    $i=isset($_GET["i"])?(int)$_GET["i"]:1;
    $s=isset($_GET["s"])?(int)$_GET["s"]:0;
    $r=isset($_GET["r"])?(int)$_GET["r"]:1;
    $matchtime=isset($_GET["t"])?(int)$_GET["t"]:0;

    // Progress in percent: current round over the expected number of rounds, capped at 99
    // because completion is reported separately. Defaults to 0 when there is nothing queued.
    $per=0;
    $times=ceil($s/$step);
    if($times>0)
    {
        $per=round($r/$times*100);
        if($per>99)$per=99;
        if($per<0)$per=0;
    }

    $perstr= "<span style='width:300px;height:20px;line-height:20px;vertical-align:middle'><span style='width:100px;float:left'>matching...</span><span style='width:100px;float:left;background-color:#ddd'><span style='width:".$per."px;background-color:#FFBF2A;float:left;overflow:hidden'>&nbsp;</span></span><span style='width:30px;float:left;color:red;font-size:12px;'>".$per."%</span><span style='width:20px;float:left'>^O^</span></span>";

    echo "<div id=echoinfo> $perstr </div>";

    // $matchtime ends up inside SQL text below; make sure it is a plain integer.
    $matchtime=(int)$matchtime;

    // Next batch of queued links for this search, at most $step rows.
    $sql="select * from txttmp where matchtime=$matchtime order by id limit ".$step;
    $ret = mysql_query($sql,$conn);

    // mysql_num_rows() is the documented way to count SELECT results
    // (mysql_affected_rows() is meant for INSERT/UPDATE/DELETE). A failed query is
    // treated like an empty queue.
    if(!$ret||mysql_num_rows($ret)<=0)
    {
        // Queue exhausted: drop the keywords of this search and report completion.
        $sql="delete from keytmp where matchtime=$matchtime";
        $ret = mysql_query($sql,$conn);
        echo "<script>document.getElementById('echoinfo').innerHTML='done *_*';</script>";
        mysql_close($conn);
        exit;
    }

    // Keywords stored for this search.
    $sql="select * from keytmp where matchtime=$matchtime";
    $ret2 = mysql_query($sql,$conn);

    $arr2=array();
    while($ret2&&($row2 = mysql_fetch_object($ret2)))
    {
        array_push($arr2,$row2->keyword);
    }

    // HTML rows produced by this request; sent to the parent page in one go at the end.
    $str="";

    // One curl handle per queued link, plus parallel lists (indexed by $j) holding the
    // row id, link, title and site of each handle.
    $j=0;
    $tmpidarr=$tmplinkarr=$tmptitlearr=$tmpsitearr=$connarr=array();
    $mh = curl_multi_init();

    while($row = mysql_fetch_object($ret))
    {
        $link=trim($row->link);

        $handle=curl_init($link);
        curl_setopt($handle,CURLOPT_RETURNTRANSFER,1);
        curl_setopt($handle, CURLOPT_TIMEOUT, 20);//this one is crucial
        curl_multi_add_handle($mh,$handle);
        $connarr[$j]=$handle;

        $tmpidarr[]=$row->id;//id
        $tmplinkarr[]=$link;//link
        $tmptitlearr[]=trim($row->title);//title
        $tmpsitearr[]=$row->site;//site

        $j++;
    }

    // Start all transfers.
    do {
        $mrc = curl_multi_exec($mh,$active);
    } while ($mrc == CURLM_CALL_MULTI_PERFORM);

    // Drive the transfers until none is active any more.
    while ($active and $mrc == CURLM_OK) {
        // curl_multi_select() may return -1 right away (no descriptors to wait on yet).
        // Skipping curl_multi_exec() in that case would leave $active unchanged and spin
        // forever, so pause briefly and keep driving the transfers regardless.
        if (curl_multi_select($mh) == -1) {
            usleep(100000);
        }
        do {
            $mrc = curl_multi_exec($mh, $active);
        } while ($mrc == CURLM_CALL_MULTI_PERFORM);
    }

    // Link and title of the most recently emitted inner-title row; used to skip immediate
    // repeats. Initialised explicitly so the first comparison does not read undefined variables.
    $lastlink=null;
    $lasttitle=null;

    // For every downloaded page: search its text for the keywords, then search the anchor
    // texts of its links. Matching rows are appended to $str; the page's queue row is deleted
    // once it has been handled.
    foreach ($connarr as $j => $url) {

        // curl_multi_getcontent() reports failure through its return value, not by throwing,
        // so no try/catch is needed; an empty body is handled by the length check below.
        $content2=curl_multi_getcontent($url);

        // Everything below works on the downloaded string, so release the handle right away.
        curl_multi_remove_handle($mh,$url);
        curl_close($url);

        $pageurl=$tmplinkarr[$j];

        // Keep only the <body> element; pages without one count as empty.
        $body="";
        if(preg_match('/<body[^>]*>(.*)<\/body>/is',(string)$content2,$tmparr))
            $body=$tmparr[0];

        $content2=preg_replace("/<(script.*?)>(.*?)<(\/script.*?)>/si","",$body); //strip script tags and their content
        $content2=preg_replace("/<(style.*?)>(.*?)<(\/style.*?)>/si","",$content2); //strip style tags and their content
        $content3=trim(strip_tags($content2));

        if($content3==""||mb_strlen($content3)<50)//no content or too few characters
        {
            $sql="delete from txttmp where id=".$tmpidarr[$j];
            $ret3 = mysql_query($sql,$conn);

            continue;
        }

        if(is_utf8($content2))
            $content2= iconv("UTF-8","GB2312//IGNORE",$content2);

        preg_match_all("/<a(.*?)href=(.*?)>(.*?)<\/a>/is",$content2,$linkarr); //collect all links
        $linkarr[2]=array_unique($linkarr[2]);
        $linkarr[3]=array_unique($linkarr[3]);
        $content2=preg_replace("/<(a.*?)>(.*?)<(\/a.*?)>/si","",$content2); //strip a tags and their content
        $content2=trim(strip_tags($content2));

        // Keyword search in the page text; the first matching keyword produces one row.
        foreach($arr2 as $k2=>$v2)
        {
            if(mb_strpos($content2,$v2,0,"GB2312")===false)
                continue;

            $licss=($i%2==0)?"class='li1'":"";

            // Collapse line breaks and control whitespace so the excerpt stays on one line.
            $content2=str_replace(array("\r\n","\n","\r","\t","\0","\x0B")," ",$content2);

            // Excerpt of 120 characters starting 40 before the hit. The start is clamped to 0:
            // a negative start would make mb_substr() count from the END of the text.
            $pos=mb_strpos($content2,$v2,0,"GB2312");
            if($pos===false)$pos=0;
            $snippet=mb_substr($content2,max(0,$pos-40),120,"GB2312");
            $snippet=str_replace($v2,"【".$v2."】",$snippet);
            // The excerpt sits inside title='...': an apostrophe would end the attribute early.
            $snippet=str_replace("'","&#039;",$snippet);

            $str .= "<ul><li $licss style='width:50px'>".($i++)."</li><li $licss style='width:400px'><a href='".$tmplinkarr[$j]."' target=_blank title='".$snippet."'>".$tmptitlearr[$j]."</a></li><li $licss>".$v2."</li><li $licss><font color='blue'>内文</font></li><li $licss style='width:350px'><a href='".$tmpsitearr[$j]."' target=_blank>".$tmpsitearr[$j]."</a></li><li $licss style='border-right:1px solid green;width:160px'>".date("Y-m-d H:i:s")."</li></ul>";
            break;
        }

        // Bases for completing relative links found on this page: site root (before the third
        // "/"), directory of the page (before its last "/"), and scheme.
        $rootend=check($pageurl,"/",3);
        $pageroot=($rootend==-1)?$pageurl:substr($pageurl,0,$rootend);
        $pagedir=($rootend==-1)?$pageurl:substr($pageurl,0,strrpos($pageurl,"/"));
        $pagescheme=(strtolower(substr($pageurl,0,6))=="https:")?"https:":"http:";

        // Keyword search in the anchor texts of the page's links.
        foreach($linkarr[3] as $k=>$v)
        {
            // array_unique() above may have removed the href belonging to this anchor text.
            if(!isset($linkarr[2][$k]))continue;

            $v=trim($v);
            $stripv=strip_tags($v);

            // href value: drop quotes and keep only the part before the first space
            // (explode() replaces the regex-based split(), which was removed in PHP 7).
            $link=str_replace("'","",$linkarr[2][$k]);
            $link=str_replace("\"","",$link);
            $link=explode(" ",$link);
            $link=trim($link[0]);

            if($lastlink==$link||$lasttitle==$stripv)continue;

            // Skip non-navigational links and links back to the page itself. The comparison
            // uses the page URL (the original compared against the curl handle).
            if(strtolower(substr($link,0,6))=="mailto"||strtolower(substr($link,0,10))=="javascript"||substr($link,0,1)=="{"||substr($link,0,1)=="#"||$link=="/"||$link==""||$link==$pageurl||$link==$pageurl."/")
                continue;

            if(substr($link,0,4)!="http")
            {
                if(substr($link,0,2)=="//")
                    $link=$pagescheme.$link;   // protocol-relative
                elseif(substr($link,0,1)=="/")
                    $link=$pageroot.$link;     // root-relative
                else
                    $link=$pagedir."/".$link;  // relative to the page's directory
            }

            $stripv=str_replace(array("\r\n","\n","\r","\t","\0","\x0B")," ",$stripv);

            foreach($arr2 as $k2=>$v2)
            {
                $p=mb_strpos($stripv,$v2,0,"GB2312");

                if($p!==false)
                {
                    $licss=($i%2==0)?"class='li1'":"";

                    $str .= "<ul><li $licss style='width:50px'>".($i++)."</li><li $licss style='width:400px'><a href='".$link."' target=_blank>".mb_substr($stripv,0,$p,"GB2312")."<font color=fuchsia>".$v2."</font>".mb_substr($stripv,$p+mb_strlen($v2,"GB2312"),mb_strlen($stripv,"GB2312"),"GB2312")."</a></li><li $licss>".$v2."</li><li $licss>"."<font color='red'>内页标题</font>"."</li><li style='width:350px;overflow:hidden;white-space:nowrap;' $licss><a href='".$tmplinkarr[$j]."' target=_blank>".$tmplinkarr[$j]."</a></li><li $licss style='border-right:1px solid green;width:160px'>".date("Y-m-d H:i:s")."</li></ul>";

                    $lastlink=$link;
                    $lasttitle=$stripv;

                    break; // first matching keyword wins
                }
            }
        }

        // Page handled: remove it from the queue.
        $sql="delete from txttmp where id=".$tmpidarr[$j];
        $ret3 = mysql_query($sql,$conn);

        // Out of time for this request; rows not handled yet stay queued for the next one.
        if(time()>=$breaktime)break;

    }
    // Append the collected rows to the result list of the parent page. Titles read from the
    // queue may still contain line breaks, and a raw line break inside the single-quoted
    // JavaScript literal is a syntax error that would drop the whole batch - flatten them first.
    $str=str_replace(array("\r\n","\r","\n")," ",$str);
    echo "<script>top.document.getElementById('result').innerHTML+='".addslashes($str)."';</script>\n";

    curl_multi_close($mh);
    mysql_close($conn);

    // Make each request last at least about 2 seconds before the page reloads itself.
    // The wait is computed once so that sleep() can never receive a stale or negative value.
    $wait=$breaktime-28-time();
    if($wait>0)
        sleep($wait);
    ?>
    <script>
    // Runs inside the worker iframe after a batch: optionally scroll the parent's result
    // list to the bottom, then either load the next round or, when the parent's pause flag
    // is set, park the next URL in the parent's hidden "currentlink" field.
    var topdoc=top.document;
    if(topdoc.getElementById("mychk").checked)
    {
        var resultbox=topdoc.getElementById('result');
        resultbox.scrollTop=resultbox.scrollHeight;
    }
    currentlink="getkeycontent.php?i=<?php echo $i;?>&t=<?php echo $matchtime;?>&s=<?php echo $s;?>&r=<?php echo ++$r;?>";
    if(top.breakit)
    {
        topdoc.getElementById('currentlink').value=currentlink;
        document.getElementById('echoinfo').innerHTML='break :-)';
    }
    else
    {
        topdoc.getElementById('myifrm').src=currentlink;
    }
    </script>
    </body>

    分享到:

    历史上的今天: