Day10 Study Notes

Overall crawler workflow (pagination → list page → detail page → database)

  • Core steps (a pipeline sketch follows the list):
  1. Build a request (with a User-Agent header) and fetch the HTML
  2. Extract the set of detail-page links from the list page
  3. Request each detail page and pull out fields with regular expressions
  4. Assemble structured records and write them to the database (or a file)
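
  • A minimal sketch of how the four steps chain together. The helper callables are passed in as parameters because their concrete bodies are built section by section below; fetch_html here is a hypothetical stand-in for the urllib2 code in the next section:

# -*- coding: utf-8 -*-
import urllib2

def fetch_html(url, headers=None):
    # Step 1: fetch raw HTML, optionally with a spoofed User-Agent
    req = urllib2.Request(url, headers=headers or {})
    return urllib2.urlopen(req).read()

def crawl_page(page_no, extract_detail_urls, parse_detail, save_items, headers=None):
    html = fetch_html("http://bj.58.com/chuzu/pn%d/" % page_no, headers)  # step 1
    urls = extract_detail_urls(html)                                      # step 2
    items = [parse_detail(fetch_html(u, headers)) for u in urls]          # step 3
    save_items([it for it in items if it])                                # step 4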

Requests and spoofed headers

  • Use urllib2 to build a request carrying a UA header, then read the page source:
# -*- coding: utf-8 -*-
# (Python 2 source files need the coding cookie above once Chinese literals appear)
import urllib2

headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1)"}
url = "http://bj.58.com/chuzu/pn1/"

req = urllib2.Request(url, headers=headers)  # attach the spoofed UA
resp = urllib2.urlopen(req)
html = resp.read()
print len(html)
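
  • Real requests fail: timeouts, 4xx/5xx responses, connection resets. A hedged sketch of a more defensive fetch (the retry count and timeout values are assumptions, not from the original notes):

from urllib2 import Request, urlopen, URLError

def fetch_with_retry(url, headers, retries=3, timeout=10):
    # Try a few times before giving up; return None on final failure.
    for _ in range(retries):
        try:
            return urlopen(Request(url, headers=headers), timeout=timeout).read()
        except URLError as e:  # HTTPError is a subclass of URLError
            print "fetch failed:", url, e
    return None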

List page parsing (extracting detail links)

  • Use a regular expression to grab the detail link of each rental listing from the list page:
import re

pattern = re.compile(
    '<li\s*logr.*?'
    '<div\s*class="des".*?'
    '<h2>.*?'
    '<a\s*href="(.*?)"\s*tongji.*?'
    '</h2>.*?'
    '</li>',
    re.S
)

url_list = re.findall(pattern, html)  # items look like //bj.58.com/....
detail_urls = ["http:" + u for u in url_list]
print "Number of detail links:", len(detail_urls)

Detail page parsing (field extraction)

  • Common fields: rent, payment method, lease type, house type, complex name, full address, contact name, phone number, etc.
  • Use re.search with capture groups, plus re.S so patterns can match across lines:
def parse_detail(html):
    # Rent and payment method
    m1 = re.search(
        r'<div\s*class="main-wrap">.*?'
        r'<div\s*class="house-pay-way\s*.*?'
        r'<b\s*class=.*?>([0-9]*)</b>.*?'
        r'<span\s*class=.*?>(.*?)</span>.*?',
        html, re.S)

    # Lease type, house type, complex name, address
    m2 = re.search(
        r'<div\s*class="main-wrap">.*?'
        r'<div\s*class="house-desc-item.*?'
        r'<ul\s*class=.*?'
        r'<li>.*?租赁方式.*?<span>(.*?)</span>.*?</li>.*?'
        r'<li>.*?房屋类型.*?<span>(.*?)\s*&.*?</li>.*?'
        r'<li>.*?所在小区.*?onclick=.*?>(.*?)</a>.*?</li>.*?'
        r'class=.*?详细地址.*?class="dz.*?"\s*>\s*(.*?)\s*</span>',
        html, re.S)

    # Contact name and phone number
    m3 = re.search(
        r'<div\s*class="main-wrap">.*?'
        r'<div\s*class="house-agent-info.*?'
        r'<p\s*class="agent-name.*?onclick=.*?>(.*?)</a>.*?</p>.*?'
        r'<div\s*class="house-fraud-tip">.*?'
        r'<span\s*class="house-chat-txt">([0-9]*|.*?)</span>',
        html, re.S)

    # If any section failed to match, treat the page as unparseable
    if not (m1 and m2 and m3):
        return None

    house_rent, pay_way = m1.group(1).strip(), m1.group(2).strip()
    lease_way, house_type, houses_name, houses_address = (
        m2.group(1).strip(), m2.group(2).strip(), m2.group(3).strip(), m2.group(4).strip()
    )
    linkman, phone = m3.group(1).strip(), m3.group(2).strip()

    return {
        'houses_name': houses_name,
        'house_type': house_type,
        'lease_way': lease_way,
        'house_rent': house_rent,
        'pay_way': pay_way,
        'linkman': linkman,
        'phone': phone,
        'houses_address': houses_address,
    }
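
  • These patterns are tightly coupled to 58.com's markup and break silently when the site changes. One guard is a quick offline check against a synthetic snippet (the HTML below is fabricated to satisfy the three patterns; it is not copied from the real site):

# -*- coding: utf-8 -*-
SAMPLE = (
    '<div class="main-wrap">'
    '<div class="house-pay-way "><b class="f36">3500</b>'
    '<span class="c">押一付三</span></div>'
    '<div class="house-desc-item fl"><ul class="f14">'
    '<li><span class="t">租赁方式:</span><span>整租</span></li>'
    '<li><span class="t">房屋类型:</span><span>2室1厅 &nbsp;60平</span></li>'
    '<li><span class="t">所在小区:</span><a onclick="log()">某某家园</a></li>'
    '<li><span class="t">详细地址:</span><span class="dz">某区某街1号</span></li>'
    '</ul></div>'
    '<div class="house-agent-info">'
    '<p class="agent-name"><a onclick="log()">张三</a></p>'
    '<div class="house-fraud-tip">'
    '<span class="house-chat-txt">13800000000</span></div></div></div>'
)

item = parse_detail(SAMPLE)
assert item is not None and item['house_rent'] == '3500'
assert item['lease_way'] == '整租' and item['phone'] == '13800000000'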

Page traversal and rate limiting

  • When crawling page by page, call time.sleep(1) between detail fetches to avoid hammering the server:
import time

results = []
for detail_url in detail_urls:
    html2 = urllib2.urlopen(urllib2.Request(detail_url, headers=headers)).read()
    item = parse_detail(html2)
    if item:
        results.append(item)
    time.sleep(1)  # pause between detail requests
print "Records fetched:", len(results)

Oracle insertion workflow (cx_Oracle)

  • Connect, create the table, insert, and commit:
import cx_Oracle

db = cx_Oracle.connect("sh", "cfyf123456", "127.0.0.1:1521/orcl")
cursor = db.cursor()

# Raises ORA-00955 if the table already exists; note that DDL commits
# implicitly in Oracle, so the commit below is only a habit, not required.
cursor.execute(
    "CREATE TABLE WEB_CRAWLERS_58CITY("
    "houses_name varchar2(200), house_type varchar2(200), lease_way varchar2(200), "
    "house_rent varchar2(200), pay_way varchar2(200), linkman varchar2(200), "
    "phone varchar2(200), houses_address varchar2(200))"
)
db.commit()

for it in results:
    cursor.execute(
        "INSERT INTO WEB_CRAWLERS_58CITY(houses_name, house_type, lease_way, house_rent, "
        "pay_way, linkman, phone, houses_address) "
        "VALUES (:1, :2, :3, :4, :5, :6, :7, :8)",
        (it['houses_name'], it['house_type'], it['lease_way'], it['house_rent'],
         it['pay_way'], it['linkman'], it['phone'], it['houses_address'])
    )
db.commit()
cursor.close(); db.close()
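
  • Row-by-row execute makes one round trip per record; cx_Oracle's executemany binds the whole batch in a single call (a sketch against the same table, to run before closing the cursor; the performance benefit is the standard claim, not measured here):

rows = [
    (it['houses_name'], it['house_type'], it['lease_way'], it['house_rent'],
     it['pay_way'], it['linkman'], it['phone'], it['houses_address'])
    for it in results
]
cursor.executemany(
    "INSERT INTO WEB_CRAWLERS_58CITY(houses_name, house_type, lease_way, house_rent, "
    "pay_way, linkman, phone, houses_address) "
    "VALUES (:1, :2, :3, :4, :5, :6, :7, :8)",
    rows
)
db.commit()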

Run driver

  • An example of control flags and main-loop organization:
class Spider58(object):
    def __init__(self):
        self.pageIndex = 1
        self.enable = True
        self.headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1)"}

    def load_page(self):
        # Fetch one list page and return its normalized detail URLs
        url = "http://bj.58.com/chuzu/pn%s/" % self.pageIndex
        html = urllib2.urlopen(urllib2.Request(url, headers=self.headers)).read()
        urls = ["http:" + u for u in re.findall(pattern, html)]
        self.pageIndex += 1
        return urls

    def run(self):
        while self.enable:
            urls = self.load_page()
            if not urls:  # stop once a list page yields no links
                break
            for u in urls:
                item = parse_detail(urllib2.urlopen(urllib2.Request(u, headers=self.headers)).read())
                if item:
                    print item['houses_name'], item['house_rent']
                time.sleep(1)

Spider58().run()
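
  • run() only stops when a list page comes back empty, so for a bounded test run it can be safer to drive load_page() directly and cap the page count (this driver is an addition, not part of the original class):

spider = Spider58()
for _ in range(2):  # crawl at most two list pages
    urls = spider.load_page()
    if not urls:
        break
    for u in urls:
        item = parse_detail(urllib2.urlopen(
            urllib2.Request(u, headers=spider.headers)).read())
        if item:
            print item['houses_name'], item['house_rent']
        time.sleep(1)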
