前言:
需要在百度AI平台注册登录并创建项目。
爬虫代码
import scrapy
from BaiDuAi.items import BaiduaiItem


class AiSpider(scrapy.Spider):
    """Spider that scrapes one People's Daily article: its title and body text."""

    name = 'ai'
    # allowed_domains = ['www.xxx.com']
    # People's Daily (人民网) article URL
    start_urls = ['http://politics.people.com.cn/n1/2018/1217/c1001-30470023.html']

    def parse(self, response):
        """Extract the article title and body, yield them as a BaiduaiItem.

        The title sits in the page's 4th top-level <div>; the body is the
        concatenation of every text node under the element with id "rwb_zw".
        """
        title = response.xpath('/html/body/div[4]/h1/text()').extract_first()
        body_fragments = response.xpath('//*[@id="rwb_zw"]//text()').extract()
        # Merge the scattered text nodes, trimming surrounding whitespace.
        content = ''.join(body_fragments).strip('\n \t')

        item = BaiduaiItem()
        item['title'] = title
        item['content'] = content
        yield item
管道代码
from aip import AipNlp

# Baidu AI platform credentials (APPID / API key / secret key).
# NOTE(review): credentials are hard-coded; move them to config or
# environment variables before publishing or deploying this code.
APP_ID = '15198150'
API_KEY = 'jaObSr6rmSmqsjWfKGGpmwxB'
SECRET_KEY = '808Eiz4FPkfMwS2ajClXYhKrcFMN1YUN'

client = AipNlp(APP_ID, API_KEY, SECRET_KEY)


class BaiduaiPipeline(object):
    """Pipeline that annotates each article with Baidu NLP keywords and a
    topic category, then writes the result to ./xinwen.html."""

    def process_item(self, item, spider):
        """Call Baidu NLP on the item's title/content and write the output file.

        Returns the item unchanged so further pipelines can run.
        """
        # Baidu's API rejects non-breaking spaces, so strip them first.
        title = item['title'].replace('\xa0', '')
        content = item['content'].replace('\xa0', '')

        # Keyword extraction: collect the tag of every returned keyword.
        # Bug fix: build a LOCAL list instead of the original class-level
        # `keys = []`, which accumulated tags across all items so earlier
        # articles' keywords leaked into later output files.
        keys_dict = client.keyword(title, content)
        keys = "/".join(dic['tag'] for dic in keys_dict['items'])

        # Topic classification: take the first level-1 category tag.
        typec_dic = client.topic(title, content)
        news_type = typec_dic['item']['lv1_tag_list'][0]['tag']

        # 'w' mode overwrites the file on every item — fine for this
        # single-article spider; switch to per-item filenames for real crawls.
        with open('./xinwen.html', 'w', encoding='utf-8') as fp:
            fp.write(title + '\n\n' + content + '\n\n' + keys + '\n\n' + news_type)
        return item