采集

采集

PHP通过pthreads扩展实现真正的多线程采集

孤魂 发表了文章 • 0 个评论 • 1502 次浏览 • 2015-12-25 09:11 • 来自相关话题

最近自己的项目采集,一直在使用PHP CURL的功能进行采集,使用命令行执行PHP文件,解决了PHP运行超时的问题,但只能单线程采集。最近找到了使用pthreads实现多线程采集的方法,这里安装方法就不再详细说明了,如果你使用Phpstudy的套件的话,需要注意三点:一是选择好正确的版本,php 5.x只能使用2.09以下的版本;其次是需要将php_pthreads.dll放在ext目录,然后在php.ini文件中加载此文件;最后需要将pthreadVC2.dll分别复制到./PHPa/目录和./Apache/bin/目录。下面分享一下我的采集源码。<?php
set_time_limit(0);

/**
 * Worker thread that downloads a single URL.
 *
 * The URL is supplied at construction time; once run() has executed, the
 * value returned by model_http_curl_get() for that URL is held in $data.
 */
class new_thread_run extends Thread
{
    /** @var mixed URL this worker will fetch (stored as given to the constructor). */
    public $url;

    /** @var mixed Result of model_http_curl_get() for $url; null until run() stores it. */
    public $data;

    /**
     * @param mixed $url URL to fetch when the thread runs.
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Thread entry point: fetches the configured URL and stores the result in $data.
     *
     * A falsy URL (empty string, null, "0") is skipped and $data is left untouched.
     */
    public function run()
    {
        $target = $this->url;
        if (!$target) {
            return;
        }

        $this->data = model_http_curl_get($target);
    }
}
/**
 * Fetches every URL in $urls_array and returns the responses keyed like the input.
 *
 * When the pthreads `Thread` class is available, one new_thread_run worker is
 * started per URL and the results are collected after joining each worker.
 * Otherwise the URLs are fetched one after another with model_http_curl_get().
 *
 * @param array $urls_array URLs to fetch; keys are preserved in the result.
 * @return array Response per input key. In threaded mode a key is omitted
 *               when joining its worker does not succeed.
 */
function model_thread_result_get($urls_array)
{
    // Start from an empty array so an empty input yields array() rather than
    // an undefined variable.
    $variable_data = array();

    if (!class_exists('Thread')) {
        // pthreads is not loaded: fall back to sequential downloads.
        foreach ($urls_array as $key => $value) {
            $variable_data[$key] = model_http_curl_get($value);
        }

        return $variable_data;
    }

    // Launch all workers first so the downloads overlap.
    $thread_array = array();
    foreach ($urls_array as $key => $value) {
        $thread_array[$key] = new new_thread_run($value);
        $thread_array[$key]->start();
    }

    // join() blocks until the worker has finished, so polling isRunning()
    // in a sleep loop beforehand is unnecessary.
    foreach ($thread_array as $key => $thread) {
        if ($thread->join()) {
            $variable_data[$key] = $thread->data;
        }
    }

    return $variable_data;
}
/**
 * Performs an HTTP GET request with cURL and returns whatever curl_exec() yields.
 *
 * @param string $url Address to request.
 * @return string|bool Response body, or false when curl_exec() fails.
 */
function model_http_curl_get($url)
{
    $handle = curl_init();

    // NOTE(review): peer and host verification are switched off below, so
    // HTTPS responses are accepted without certificate checks.
    $options = array(
        CURLOPT_URL => $url,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_TIMEOUT => 20,
        CURLOPT_USERAGENT => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_SSL_VERIFYHOST => false,
    );
    foreach ($options as $option => $value) {
        curl_setopt($handle, $option, $value);
    }

    $body = curl_exec($handle);
    curl_close($handle);

    return $body;
}

// Practical example: build a list of 50 search URLs and time the batch download.
$urls_array = array();
for ($i = 0; $i < 50; $i++) {
    // Append with [] — a plain assignment would overwrite the variable with a
    // single string on every pass instead of collecting 50 URLs.
    $urls_array[] = "http://www.baidu.com/s?wd=" . mt_rand(10000, 20000);
}

// Measure the wall-clock time of the whole batch.
$t = microtime(true);
$result = model_thread_result_get($urls_array);
$e = microtime(true);
echo "多线程:" . ($e - $t) . "\n";
?>参考链接:
http://www.thinkphp.cn/topic/22676.html
http://zyan.cc/pthreads/ 查看全部
最近自己的项目采集,一直在使用PHP CURL的功能进行采集,使用命令行执行PHP文件,解决了PHP运行超时的问题,但只能单线程采集。最近找到了使用pthreads实现多线程采集的方法,这里安装方法就不再详细说明了,如果你使用Phpstudy的套件的话,需要注意三点:一是选择好正确的版本,php 5.x只能使用2.09以下的版本;其次是需要将php_pthreads.dll放在ext目录,然后在php.ini文件中加载此文件;最后需要将pthreadVC2.dll分别复制到./PHPa/目录和./Apache/bin/目录。下面分享一下我的采集源码。
<?php
set_time_limit(0);

/**
 * Worker thread that downloads a single URL.
 *
 * The URL is supplied at construction time; once run() has executed, the
 * value returned by model_http_curl_get() for that URL is held in $data.
 */
class new_thread_run extends Thread
{
    /** @var mixed URL this worker will fetch (stored as given to the constructor). */
    public $url;

    /** @var mixed Result of model_http_curl_get() for $url; null until run() stores it. */
    public $data;

    /**
     * @param mixed $url URL to fetch when the thread runs.
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Thread entry point: fetches the configured URL and stores the result in $data.
     *
     * A falsy URL (empty string, null, "0") is skipped and $data is left untouched.
     */
    public function run()
    {
        $target = $this->url;
        if (!$target) {
            return;
        }

        $this->data = model_http_curl_get($target);
    }
}
/**
 * Fetches every URL in $urls_array and returns the responses keyed like the input.
 *
 * When the pthreads `Thread` class is available, one new_thread_run worker is
 * started per URL and the results are collected after joining each worker.
 * Otherwise the URLs are fetched one after another with model_http_curl_get().
 *
 * @param array $urls_array URLs to fetch; keys are preserved in the result.
 * @return array Response per input key. In threaded mode a key is omitted
 *               when joining its worker does not succeed.
 */
function model_thread_result_get($urls_array)
{
    // Start from an empty array so an empty input yields array() rather than
    // an undefined variable.
    $variable_data = array();

    if (!class_exists('Thread')) {
        // pthreads is not loaded: fall back to sequential downloads.
        foreach ($urls_array as $key => $value) {
            $variable_data[$key] = model_http_curl_get($value);
        }

        return $variable_data;
    }

    // Launch all workers first so the downloads overlap.
    $thread_array = array();
    foreach ($urls_array as $key => $value) {
        $thread_array[$key] = new new_thread_run($value);
        $thread_array[$key]->start();
    }

    // join() blocks until the worker has finished, so polling isRunning()
    // in a sleep loop beforehand is unnecessary.
    foreach ($thread_array as $key => $thread) {
        if ($thread->join()) {
            $variable_data[$key] = $thread->data;
        }
    }

    return $variable_data;
}
/**
 * Performs an HTTP GET request with cURL and returns whatever curl_exec() yields.
 *
 * @param string $url Address to request.
 * @return string|bool Response body, or false when curl_exec() fails.
 */
function model_http_curl_get($url)
{
    $handle = curl_init();

    // NOTE(review): peer and host verification are switched off below, so
    // HTTPS responses are accepted without certificate checks.
    $options = array(
        CURLOPT_URL => $url,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_TIMEOUT => 20,
        CURLOPT_USERAGENT => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_SSL_VERIFYHOST => false,
    );
    foreach ($options as $option => $value) {
        curl_setopt($handle, $option, $value);
    }

    $body = curl_exec($handle);
    curl_close($handle);

    return $body;
}

// Practical example: build a list of 50 search URLs and time the batch download.
$urls_array = array();
for ($i = 0; $i < 50; $i++) {
    // Append with [] — a plain assignment would overwrite the variable with a
    // single string on every pass instead of collecting 50 URLs.
    $urls_array[] = "http://www.baidu.com/s?wd=" . mt_rand(10000, 20000);
}

// Measure the wall-clock time of the whole batch.
$t = microtime(true);
$result = model_thread_result_get($urls_array);
$e = microtime(true);
echo "多线程:" . ($e - $t) . "\n";
?>
参考链接:

PHP通过pthreads扩展实现真正的多线程采集

孤魂 发表了文章 • 0 个评论 • 1502 次浏览 • 2015-12-25 09:11 • 来自相关话题

最近自己的项目采集,一直在使用PHP CURL的功能进行采集,使用命令行执行PHP文件,解决了PHP运行超时的问题,但只能单线程采集。最近找到了使用pthreads实现多线程采集的方法,这里安装方法就不再详细说明了,如果你使用Phpstudy的套件的话,需要注意三点:一是选择好正确的版本,php 5.x只能使用2.09以下的版本;其次是需要将php_pthreads.dll放在ext目录,然后在php.ini文件中加载此文件;最后需要将pthreadVC2.dll分别复制到./PHPa/目录和./Apache/bin/目录。下面分享一下我的采集源码。<?php
set_time_limit(0);

/**
 * Worker thread that downloads a single URL.
 *
 * The URL is supplied at construction time; once run() has executed, the
 * value returned by model_http_curl_get() for that URL is held in $data.
 */
class new_thread_run extends Thread
{
    /** @var mixed URL this worker will fetch (stored as given to the constructor). */
    public $url;

    /** @var mixed Result of model_http_curl_get() for $url; null until run() stores it. */
    public $data;

    /**
     * @param mixed $url URL to fetch when the thread runs.
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Thread entry point: fetches the configured URL and stores the result in $data.
     *
     * A falsy URL (empty string, null, "0") is skipped and $data is left untouched.
     */
    public function run()
    {
        $target = $this->url;
        if (!$target) {
            return;
        }

        $this->data = model_http_curl_get($target);
    }
}
/**
 * Fetches every URL in $urls_array and returns the responses keyed like the input.
 *
 * When the pthreads `Thread` class is available, one new_thread_run worker is
 * started per URL and the results are collected after joining each worker.
 * Otherwise the URLs are fetched one after another with model_http_curl_get().
 *
 * @param array $urls_array URLs to fetch; keys are preserved in the result.
 * @return array Response per input key. In threaded mode a key is omitted
 *               when joining its worker does not succeed.
 */
function model_thread_result_get($urls_array)
{
    // Start from an empty array so an empty input yields array() rather than
    // an undefined variable.
    $variable_data = array();

    if (!class_exists('Thread')) {
        // pthreads is not loaded: fall back to sequential downloads.
        foreach ($urls_array as $key => $value) {
            $variable_data[$key] = model_http_curl_get($value);
        }

        return $variable_data;
    }

    // Launch all workers first so the downloads overlap.
    $thread_array = array();
    foreach ($urls_array as $key => $value) {
        $thread_array[$key] = new new_thread_run($value);
        $thread_array[$key]->start();
    }

    // join() blocks until the worker has finished, so polling isRunning()
    // in a sleep loop beforehand is unnecessary.
    foreach ($thread_array as $key => $thread) {
        if ($thread->join()) {
            $variable_data[$key] = $thread->data;
        }
    }

    return $variable_data;
}
/**
 * Performs an HTTP GET request with cURL and returns whatever curl_exec() yields.
 *
 * @param string $url Address to request.
 * @return string|bool Response body, or false when curl_exec() fails.
 */
function model_http_curl_get($url)
{
    $handle = curl_init();

    // NOTE(review): peer and host verification are switched off below, so
    // HTTPS responses are accepted without certificate checks.
    $options = array(
        CURLOPT_URL => $url,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_TIMEOUT => 20,
        CURLOPT_USERAGENT => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_SSL_VERIFYHOST => false,
    );
    foreach ($options as $option => $value) {
        curl_setopt($handle, $option, $value);
    }

    $body = curl_exec($handle);
    curl_close($handle);

    return $body;
}

// Practical example: build a list of 50 search URLs and time the batch download.
$urls_array = array();
for ($i = 0; $i < 50; $i++) {
    // Append with [] — a plain assignment would overwrite the variable with a
    // single string on every pass instead of collecting 50 URLs.
    $urls_array[] = "http://www.baidu.com/s?wd=" . mt_rand(10000, 20000);
}

// Measure the wall-clock time of the whole batch.
$t = microtime(true);
$result = model_thread_result_get($urls_array);
$e = microtime(true);
echo "多线程:" . ($e - $t) . "\n";
?>参考链接:
http://www.thinkphp.cn/topic/22676.html
http://zyan.cc/pthreads/ 查看全部
最近自己的项目采集,一直在使用PHP CURL的功能进行采集,使用命令行执行PHP文件,解决了PHP运行超时的问题,但只能单线程采集。最近找到了使用pthreads实现多线程采集的方法,这里安装方法就不再详细说明了,如果你使用Phpstudy的套件的话,需要注意三点:一是选择好正确的版本,php 5.x只能使用2.09以下的版本;其次是需要将php_pthreads.dll放在ext目录,然后在php.ini文件中加载此文件;最后需要将pthreadVC2.dll分别复制到./PHPa/目录和./Apache/bin/目录。下面分享一下我的采集源码。
<?php
set_time_limit(0);

/**
 * Worker thread that downloads a single URL.
 *
 * The URL is supplied at construction time; once run() has executed, the
 * value returned by model_http_curl_get() for that URL is held in $data.
 */
class new_thread_run extends Thread
{
    /** @var mixed URL this worker will fetch (stored as given to the constructor). */
    public $url;

    /** @var mixed Result of model_http_curl_get() for $url; null until run() stores it. */
    public $data;

    /**
     * @param mixed $url URL to fetch when the thread runs.
     */
    public function __construct($url)
    {
        $this->url = $url;
    }

    /**
     * Thread entry point: fetches the configured URL and stores the result in $data.
     *
     * A falsy URL (empty string, null, "0") is skipped and $data is left untouched.
     */
    public function run()
    {
        $target = $this->url;
        if (!$target) {
            return;
        }

        $this->data = model_http_curl_get($target);
    }
}
/**
 * Fetches every URL in $urls_array and returns the responses keyed like the input.
 *
 * When the pthreads `Thread` class is available, one new_thread_run worker is
 * started per URL and the results are collected after joining each worker.
 * Otherwise the URLs are fetched one after another with model_http_curl_get().
 *
 * @param array $urls_array URLs to fetch; keys are preserved in the result.
 * @return array Response per input key. In threaded mode a key is omitted
 *               when joining its worker does not succeed.
 */
function model_thread_result_get($urls_array)
{
    // Start from an empty array so an empty input yields array() rather than
    // an undefined variable.
    $variable_data = array();

    if (!class_exists('Thread')) {
        // pthreads is not loaded: fall back to sequential downloads.
        foreach ($urls_array as $key => $value) {
            $variable_data[$key] = model_http_curl_get($value);
        }

        return $variable_data;
    }

    // Launch all workers first so the downloads overlap.
    $thread_array = array();
    foreach ($urls_array as $key => $value) {
        $thread_array[$key] = new new_thread_run($value);
        $thread_array[$key]->start();
    }

    // join() blocks until the worker has finished, so polling isRunning()
    // in a sleep loop beforehand is unnecessary.
    foreach ($thread_array as $key => $thread) {
        if ($thread->join()) {
            $variable_data[$key] = $thread->data;
        }
    }

    return $variable_data;
}
/**
 * Performs an HTTP GET request with cURL and returns whatever curl_exec() yields.
 *
 * @param string $url Address to request.
 * @return string|bool Response body, or false when curl_exec() fails.
 */
function model_http_curl_get($url)
{
    $handle = curl_init();

    // NOTE(review): peer and host verification are switched off below, so
    // HTTPS responses are accepted without certificate checks.
    $options = array(
        CURLOPT_URL => $url,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_TIMEOUT => 20,
        CURLOPT_USERAGENT => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2)',
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_SSL_VERIFYHOST => false,
    );
    foreach ($options as $option => $value) {
        curl_setopt($handle, $option, $value);
    }

    $body = curl_exec($handle);
    curl_close($handle);

    return $body;
}

// Practical example: build a list of 50 search URLs and time the batch download.
$urls_array = array();
for ($i = 0; $i < 50; $i++) {
    // Append with [] — a plain assignment would overwrite the variable with a
    // single string on every pass instead of collecting 50 URLs.
    $urls_array[] = "http://www.baidu.com/s?wd=" . mt_rand(10000, 20000);
}

// Measure the wall-clock time of the whole batch.
$t = microtime(true);
$result = model_thread_result_get($urls_array);
$e = microtime(true);
echo "多线程:" . ($e - $t) . "\n";
?>
参考链接: