当先锋百科网

首页 1 2 3 4 5 6 7

这次运用数据库来存储数据,并从中提取数据

成果展示

部分成果1.png

部分成果2.png

代码

import requests

import time

from bs4 import BeautifulSoup

from pymongo import MongoClient

# MongoDB connection and collection handles used by both spiders below.
client = MongoClient('localhost', 27017)

# Database holding the scraped phone-number data.
PhoneNum = client['PhoneNum']

# Collection of listing-page results: {'number': ..., 'PhoneLink': ...}.
url_list = PhoneNum['url_list']

# Collection of detail-page results: {'title': ..., 'price': ...}.
# Fixed: the original line had a stray trailing ']' (syntax error).
phone_info = PhoneNum['phone_info']

# spider 1

def get_phone_link(pages):
    """Scrape one 58.com phone-number listing page and store number/link pairs.

    Args:
        pages: 1-based page index of the listing to fetch.

    Side effects:
        Inserts one document per listing entry into the ``url_list``
        collection, shaped {'number': <text>, 'PhoneLink': <href without query>}.
    """
    url = 'http://bj.58.com/shoujihao/pn{}/'.format(str(pages))
    wb_data = requests.get(url)
    time.sleep(1)  # be polite to the server between requests
    soup = BeautifulSoup(wb_data.text, 'lxml')
    links = soup.select('li a.t')
    numbers = soup.select('strong')
    lk = soup.select('#infocont > span > b')
    # Fixed: the original tested "'0' not in lk", but lk is a list of bs4 Tag
    # objects, so the string '0' can never be a member and the check always
    # passed. Compare against the element's text to detect an empty result page.
    if lk and lk[0].get_text() != '0':
        for number, link in zip(numbers, links):
            data = {
                'number': number.get_text(),
                # Drop the query string so the stored link is canonical.
                'PhoneLink': link.get('href').split('?')[0],
            }
            print(data)
            url_list.insert_one(data)

# spider 2

def get_item_info(url):
    """Scrape a single 58.com detail page and store its title and price.

    Args:
        url: absolute URL of one phone-number detail page.

    Side effects:
        Inserts {'title': ..., 'price': ...} documents (whitespace stripped)
        into the ``phone_info`` collection.
    """
    response = requests.get(url)
    time.sleep(1)  # throttle between page fetches
    page = BeautifulSoup(response.text, 'lxml')
    title_tags = page.select('div.col_sub.mainTitle > h1')
    price_tags = page.select('div.col_sub.sumary >ul > li > div.su_con > span')
    for title_tag, price_tag in zip(title_tags, price_tags):
        # Strip newlines and spaces from the raw element text.
        clean_title = title_tag.get_text().replace("\n", "").replace(" ", "")
        clean_price = price_tag.get_text().replace("\n", "").replace(" ", "")
        record = {'title': clean_title, 'price': clean_price}
        print(record)
        phone_info.insert_one(record)

# Stage 1: crawl listing pages 1-199 and collect detail-page links.
for page in range(1, 200):
    get_phone_link(page)

# Stage 2: visit every stored detail link and scrape its title/price.
for info in url_list.find():
    # Fixed: documents are stored with the key 'PhoneLink' (see
    # get_phone_link), not 'url' — the original raised KeyError here.
    url = info['PhoneLink']
    get_item_info(url)

总结

遇到需要采集大量数据的情况,最好的方式就是将功能分离开来,一次只执行一个动作。