抓取邮件地址
/**
 * Scrape e-mail addresses from the content at $url and append them to a text file.
 *
 * The content is read line by line with file(). Every address matched on any line
 * is collected, duplicates are removed (first occurrence wins), and the result is
 * appended to "<$tname>.txt" as one comma-separated line terminated by ",\n".
 * A one-line status message is echoed in every case.
 *
 * @param string $url   URL or path readable by file().
 * @param string $tname Output file name without the ".txt" extension.
 * @return void
 */
function findEmail($url, $tname = 'emails') {
    // Local part: a letter, then alphanumeric runs separated by a single "-", "_" or ".".
    // Domain: one or more dot-terminated labels followed by an alphabetic TLD of 2+ letters.
    // Each run is matched by exactly one sub-pattern, so a long token without "@"
    // cannot trigger exponential backtracking.
    $pattern = '/[a-zA-Z][A-Za-z0-9]*(?:[-_.][A-Za-z0-9]+)*'
        . '@(?:[-_]?[A-Za-z0-9]+(?:[-_][A-Za-z0-9]+)*\.)+[A-Za-z]{2,}/';

    $found = array();
    $lines = file($url);
    if (is_array($lines)) {
        foreach ($lines as $line) {
            // Keep every address on the line, not just the first one.
            if (preg_match_all($pattern, $line, $matches)) {
                foreach ($matches[0] as $address) {
                    $found[] = $address;
                }
            }
        }
    }

    // array_unique() preserves keys; reindex so the list is contiguous.
    $found = array_values(array_unique($found));
    if (empty($found)) {
        echo "from: ".$url." I get no any email...";
        return;
    }

    $total = count($found);
    $email = implode(",", $found).",\n";

    // Append to the output file so repeated calls accumulate results.
    $txtname = $tname.'.txt';
    if (file_put_contents($txtname, $email, FILE_APPEND) === false) {
        echo "from: ".$url." I could not write to ".$txtname;
        return;
    }

    echo "from: ".$url." I get ".$total." email address.";
}
/**
 * Collect the http:// and https:// links found in the content at $url.
 *
 * The content is read line by line with file(). Each link is kept only if its
 * host is not on the built-in exclusion list and looks like a dotted domain name
 * (so bare hosts such as "localhost" are dropped). Duplicates are removed and
 * the first-seen order is preserved.
 *
 * @param string $url URL or path readable by file().
 * @return array List of unique link URLs; empty when the content cannot be read
 *               or contains no acceptable link.
 */
function findUrl($url) {
    // Hosts whose links are never returned.
    $webarr = array('www.w3.org','maps.google.com','map.baidu.com','maps.google.nl');
    // Dotted host name: labels of up to 63 characters and a 2-6 letter TLD.
    $hostPattern = '/^([a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,6}$/';

    $conarr = file($url);
    if (!is_array($conarr)) {
        return array();
    }

    // A link ends at whitespace, a quote, an angle bracket or ")".
    // No "http://" pre-filter: it would skip lines that only contain https links.
    $uarr = array();
    foreach ($conarr as $val) {
        if (preg_match_all('/https?:\/\/[^\s"\'>)<]*/', $val, $murl)) {
            foreach ($murl[0] as $v) {
                $uarr[] = $v;
            }
        }
    }

    $urlArr = array();
    foreach (array_unique($uarr) as $val) {
        $pu = parse_url($val);
        // parse_url() returns false for malformed URLs and may omit the host.
        if (!is_array($pu) || !isset($pu['host'])) {
            continue;
        }
        if (!in_array($pu['host'], $webarr, true) && preg_match($hostPattern, $pu['host'])) {
            $urlArr[] = $val;
        }
    }

    return $urlArr;
}