`

抓取网页数据导入到entity

 
阅读更多
/**
 * Imports a single, hard-coded blog post through blog_crawler().
 *
 * Neither argument is read by this function.
 *
 * @param mixed $form
 *   Unused. Presumably a Drupal form array - confirm against the caller.
 * @param mixed $form_state
 *   Unused. Presumably the matching form state - confirm against the caller.
 */
function one_crawl($form, $form_state) {
  blog_crawler('http://blog.eau-thermale-avene.cn/post/536.html');
}

/**
 * Fetches a blog post page and stores its content as a 'blog' node.
 *
 * The publish date, title, original URL, tags, category and body are scraped
 * out of the page markup and written to a new node through BlogFetch. Every
 * non-empty reader comment found on the page is then saved as an anonymous,
 * published comment on that node.
 *
 * @param string $link
 *   URL of the blog post to import. The function returns at once when empty.
 * @param array $context
 *   Receives a progress string in $context['message'].
 *   NOTE(review): presumably a Batch API context array - confirm with callers.
 * @param string|null $summary
 *   Optional summary stored together with the node body.
 *
 * @throws Exception
 *   Propagated from BlogFetch when the page cannot be downloaded.
 */
function blog_crawler($link, &$context = array(), $summary = NULL) {
  if (empty($link)) {
    return;
  }
  watchdog('crawler link', $link);
  $default = array(
    'uid' => 1,
  );
  $fetch = new BlogFetch('blog', $link, $default);
  $blog = str_get_html($fetch->html);
  if (!is_object($blog)) {
    // Without a DOM tree there is nothing to scrape; bail out instead of
    // calling find() on a non-object.
    watchdog('crawler link', 'Unable to parse HTML from @link', array('@link' => $link), WATCHDOG_ERROR);
    return;
  }

  // Blog publish date. The date element is blanked afterwards so it does not
  // leak into the title extracted below.
  $date_node = $blog->find('#divMain h2.post-title span.post-date', 0);
  if ($date_node) {
    $created = strtotime($date_node->plaintext);
    $date_node->innertext = '';
    if ($created !== FALSE) {
      $fetch->set('created', $created);
    }
  }
  // The entity property 'changed' does not support writing, so it is not set.

  // Blog title: drop the first <span> inside the heading, keep the rest.
  $title_span = $blog->find('#divMain h2.post-title span', 0);
  if ($title_span) {
    $title_span->outertext = '';
  }
  $heading = $blog->find('#divMain h2.post-title', 0);
  if ($heading) {
    // NOTE(review): the original call replaced 'Q&A' with 'Q&A' (a no-op);
    // the search string was most likely the HTML entity form - confirm.
    $title = str_replace('Q&amp;A', 'Q&A', $heading->innertext);
    $fetch->set('title', $title);
  }

  // Blog original URL.
  $fetch->set('field_link', array('url' => $link));

  // Blog tags.
  $terms = array();
  foreach ($blog->find('#divMain div.post-body .post-info a') as $tag) {
    if (!empty($tag->plaintext)) {
      $terms[] = avene_taxonomy($tag->plaintext, 'blog_tags');
    }
  }
  $fetch->set('field_blog_tags', $terms);

  // Blog category: the first three characters of the footer text are dropped.
  $cate_node = $blog->find('#divMain div.post-body .post-footer-category', 0);
  if ($cate_node) {
    $cate = drupal_substr(trim($cate_node->plaintext), 3);
    $fetch->set('field_blog_type', avene_taxonomy($cate, 'blog_category'));
  }

  // Blog body: strip the tag list and the footer before reading the markup.
  foreach (array('.post-info', '.post-footer') as $strip) {
    $strip_node = $blog->find('#divMain div.post-body ' . $strip, 0);
    if ($strip_node) {
      $strip_node->outertext = '';
    }
  }
  $body_node = $blog->find('#divMain div.post-body', 0);
  if ($body_node) {
    $fetch->set('body', array('format' => 'full_html', 'summary' => $summary, 'value' => $body_node->innertext));
  }

  // Blog comments.
  foreach ($blog->find('.msg-boxes .msg-box-content') as $c) {
    // Skip empty boxes but keep processing the remaining ones.
    if (empty($c->plaintext)) {
      continue;
    }
    $article = $c->find('.msgarticle', 0);
    if (!$article) {
      continue;
    }

    // A fresh object per comment so no state leaks between iterations.
    $comment = new stdClass();
    // Node the comment is attached to.
    $comment->nid = $fetch->entity->nid;
    // 0 = new comment.
    $comment->cid = 0;
    // 0 = no parent comment.
    $comment->pid = 0;
    // 0 = anonymous author.
    $comment->uid = 0;
    $comment->is_anonymous = 1;
    $comment->status = COMMENT_PUBLISHED;
    $comment->language = LANGUAGE_NONE;
    $comment->subject = '';

    // Remove the link inside the timestamp element so only the date remains.
    $time_link = $c->find('.msgtime a', 0);
    if ($time_link) {
      $time_link->outertext = '';
    }
    $time_node = $c->find('.msgtime', 0);
    if ($time_node) {
      $posted = trim($time_node->plaintext);
      $created = strtotime($posted);
      if ($created !== FALSE) {
        $comment->created = $created;
        // NOTE(review): Drupal 7's comment_submit() appears to recompute
        // ->created from ->date; setting both keeps the scraped time.
        $comment->date = $posted;
      }
    }

    // The original passed the value through strtr() with an empty-string key,
    // which returns FALSE before PHP 8 and is a no-op afterwards; whatever
    // character it was meant to strip is not recoverable from the source.
    $val = filter_var($article->plaintext, FILTER_SANITIZE_SPECIAL_CHARS);
    $comment->comment_body[$comment->language][0]['value'] = $val;
    $comment->comment_body[$comment->language][0]['format'] = 'full_html';

    comment_submit($comment);
    comment_save($comment);
  }

  $context['message'] = 'fetching ' . $fetch->entity->title;

  // Release the DOM tree explicitly; simple_html_dom keeps circular
  // references that are otherwise not freed promptly.
  $blog->clear();
  unset($blog);
}

/**
 * Downloads a page and holds a new node that scraped values are written to.
 *
 * Construction performs the HTTP request first and then creates an unsaved
 * node of the given bundle. Each call to set() writes one property through an
 * entity metadata wrapper and saves the node.
 */
class BlogFetch {
  /**
   * Node bundle passed to entity_create() as 'type'.
   *
   * @var string
   */
  protected $type;

  /**
   * URL the page is downloaded from.
   *
   * @var string
   */
  protected $url;

  /**
   * Extra initial values for entity_create(), e.g. array('uid' => 1).
   *
   * @var array
   */
  protected $args;

  /**
   * Raw response body of the downloaded page.
   *
   * @var string|null
   */
  public $html;

  /**
   * The node being populated.
   *
   * @var object|null
   */
  public $entity;

  /**
   * Downloads $url and prepares an empty node of bundle $type.
   *
   * @param string $type
   *   Node bundle.
   * @param string $url
   *   Page to download.
   * @param array $args
   *   Additional initial entity values; 'type' is always taken from $type.
   *
   * @throws Exception
   *   When the page cannot be fetched with HTTP status 200.
   */
  public function __construct($type, $url, $args = array()) {
    $this->type = $type;
    $this->args = $args;
    $this->url = $url;
    $this->fetchData();
    $this->buildEntity();
  }

  /**
   * Creates the (still unsaved) node from the bundle and the extra values.
   */
  public function buildEntity() {
    // Array union: a 'type' key inside $this->args cannot override the bundle.
    $args = array('type' => $this->type) + $this->args;
    $this->entity = entity_create('node', $args);
  }

  /**
   * Downloads the page into $this->html.
   *
   * @throws Exception
   *   When the response code is anything other than 200.
   */
  public function fetchData() {
    $request = drupal_http_request($this->url);
    if ($request->code != 200) {
      throw new Exception('Failure on fetch:' . $this->url . '. http code:' . $request->code);
    }
    // Guard against a response object that carries no data property.
    $this->html = isset($request->data) ? $request->data : '';
  }

  /**
   * Writes one property on the node and saves the node.
   *
   * @param string $property
   *   Property or field name understood by the entity metadata wrapper.
   * @param mixed $value
   *   Value to assign.
   * @param mixed $type
   *   Unused; kept so existing callers keep working.
   */
  public function set($property, $value, $type = NULL) {
    $wrapper = entity_metadata_wrapper('node', $this->entity);
    $wrapper->{$property}->set($value);
    $wrapper->save();
  }

  /**
   * Drops the references to the downloaded markup and the node.
   */
  public function __destruct() {
    $this->html = NULL;
    $this->entity = NULL;
  }
}
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics