How to Scrape Beike Real‑Estate Listings with Python: A Complete Guide
This tutorial walks you through building a Python web scraper for Beike (Lianjia) second-hand property listings step by step, covering session spoofing, dynamic request parameters, pagination, multithreaded detail-page fetching, data cleaning, and exporting the results to Excel.
Introduction
The article demonstrates how to crawl second‑hand community data from the Beike website, whose anti‑scraping measures are more complex than those of Anjuke. Most of the tutorial content was generated with AI, so readers are encouraged to ask questions in the comments.
Overall Architecture
The scraper consists of session initialization, parameter generation, list‑page fetching and parsing, detail‑page fetching with multithreading, data cleaning, and finally saving all results to an Excel file.
Key Code Overview
<code>import requests
import time
import random
import pandas as pd
from bs4 import BeautifulSoup
import math
from concurrent.futures import ThreadPoolExecutor, as_completed
</code>1. Session Initialization (Browser Spoofing)
<code>def init_session(config):
session = requests.Session()
session.headers.update({
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Referer': f"https://{config['city']}.ke.com/"
})
session.cookies.update(config['cookies'])
session.config = config # store config for later use
return session
</code>2. Dynamic Parameter Generation (Anti‑Scraping Core)
<code>def get_params(session):
return {
'_t': str(int(time.time() * 1000)), # 13‑digit timestamp
'srcid': session.config['srcid'] # device fingerprint
}
</code>3. List Page Fetching
<code>def fetch_list_page(session, page_url):
time.sleep(random.uniform(0.2, 0.4))
response = session.get(page_url, params=get_params(session), timeout=8)
response.raise_for_status()
return parse_list_page(response.text)
</code>4. List Page Parsing
<code>def parse_list_page(html):
soup = BeautifulSoup(html, 'html.parser')
items = soup.select('li.xiaoquListItem')
results = []
for item in items:
try:
info = {
'小区名称': item.select_one('.title a').text.strip(),
'参考均价': f"{item.select_one('.totalPrice span').text}元/㎡" if item.select_one('.totalPrice') else '暂无数据',
'成交信息': item.select_one('.houseInfo a[href*="chengjiao"]').text.strip() if item.select_one('.houseInfo a[href*="chengjiao"]') else '暂无成交',
'出租信息': item.select_one('.houseInfo a[href*="zufang"]').text.strip() if item.select_one('.houseInfo a[href*="zufang"]') else '暂无出租',
'行政区': item.select_one('.district').text.strip() if item.select_one('.district') else '未知区域',
'商圈': item.select_one('.bizcircle').text.strip() if item.select_one('.bizcircle') else '未知商圈',
'建筑年代': ' '.join(item.select_one('.positionInfo').stripped_strings).split('/')[-1].strip() if item.select_one('.positionInfo') else '未知',
'详情页链接': item.select_one('a.maidian-detail[href]')['href']
}
results.append(info)
except Exception as e:
print(f"解析异常: {e}")
return results
</code>5. Multithreaded Detail Page Fetching
<code>def fetch_detail_batch(session, urls):
details = {}
with ThreadPoolExecutor(max_workers=3) as executor:
future_to_url = {executor.submit(parse_detail_page, session, url): url for url in urls}
for future in as_completed(future_to_url):
url = future_to_url[future]
details[url] = future.result()
time.sleep(random.uniform(0.2, 0.4))
return details
</code>6. Detail Page Parsing and Data Cleaning
<code>def parse_detail_page(session, url):
try:
time.sleep(random.uniform(0.6, 1.0))
response = session.get(url, params=get_params(session), timeout=10)
soup = BeautifulSoup(response.text, 'html.parser')
# Helper to safely extract single‑line fields
def safe_extract(label):
try:
item = soup.find('span', class_='xiaoquInfoLabel', string=label)
return item.find_next('span', class_='xiaoquInfoContent').text.strip()
except:
return '暂无数据'
# Extract multi‑column information
data = {}
for col in soup.select('.xiaoquInfoItemCol'):
for item in col.select('.xiaoquInfoItem'):
lbl = item.select_one('.xiaoquInfoLabel').text.strip()
val = item.select_one('.xiaoquInfoContent').text.strip()
data[lbl] = val
detail = {
'建筑类型': data.get('建筑类型', '暂无数据'),
'房屋总数': ''.join(filter(str.isdigit, data.get('房屋总数', '')) ) or '0',
'楼栋总数': ''.join(filter(str.isdigit, data.get('楼栋总数', '')) ) or '0',
'绿化率': data.get('绿化率', '').replace('%', '').strip(),
'容积率': data.get('容积率', '暂无数据'),
'交易权属': data.get('交易权属', '暂无数据'),
'建成年代': data.get('建成年代', '暂无数据'),
'供暖类型': data.get('供暖类型', '暂无数据'),
'用水类型': data.get('用水类型', '暂无数据'),
'用电类型': data.get('用电类型', '暂无数据'),
'物业费': safe_extract('物业费').split('元')[0].strip(),
'附近门店': ' '.join(safe_extract('附近门店').replace('\n', ' ').split()),
'物业公司': safe_extract('物业公司'),
'开发商': safe_extract('开发商'),
'详情页均价': f"{soup.select_one('.xiaoquUnitPrice').text.strip()}元/㎡" if soup.select_one('.xiaoquUnitPrice') else '暂无数据'
}
return detail
except Exception as e:
print(f"详情页解析异常: {e}")
return {}
</code>7. Full Crawl Workflow
<code>def crawl_full_data(session):
config = session.config
try:
resp = session.get(f"https://{config['city']}.ke.com/xiaoqu/{config['region']}/", params=get_params(session))
soup = BeautifulSoup(resp.text, 'html.parser')
total = int(soup.select_one('h2.total span').text)
total_pages = math.ceil(total / 30)
print(f"当前区域共有 {total} 个小区, 需要爬取 {total_pages} 页数据")
except Exception as e:
print(f"获取总数失败: {e}")
total_pages = 1
all_data = []
for page in range(1, total_pages + 1):
page_url = f"https://{config['city']}.ke.com/xiaoqu/{config['region']}/p{page}"
for retry in range(2):
try:
list_data = fetch_list_page(session, page_url)
detail_urls = [item['详情页链接'] for item in list_data]
detail_results = fetch_detail_batch(session, detail_urls)
for item in list_data:
item.update(detail_results.get(item['详情页链接'], {}))
all_data.extend(list_data)
print(f"第{page}页完成,累计 {len(all_data)} 条数据")
break
except Exception as e:
print(f"第{retry+1}次重试失败: {e}")
time.sleep(random.uniform(0.2, 0.4))
return all_data
</code>8. Execution Example
<code>if __name__ == "__main__":
CONFIG = {
"city": "fs", # e.g., 佛山 -> fs
"region": "nanhai", # e.g., 南海区 -> nanhai
"cookies": {
"lianjia_uuid": "<your_uuid>",
"lianjia_token": "<your_token>",
"security_ticket": "<your_ticket>"
},
"srcid": "<your_srcid>"
}
output_name = f"{CONFIG['city']}_{CONFIG['region']}_小区数据.xlsx"
session = init_session(CONFIG)
start_time = time.time()
final_data = crawl_full_data(session)
if final_data:
df = pd.DataFrame(final_data)[[
'小区名称','参考均价','成交信息','出租信息','行政区','商圈','建筑年代',
'详情页均价','建筑类型','房屋总数','楼栋总数','绿化率','容积率',
'交易权属','建成年代','供暖类型','用水类型','用电类型',
'物业费','附近门店','物业公司','开发商','详情页链接'
]]
df.to_excel(output_name, index=False)
print(f"数据已保存至: {output_name}")
print(f"总计 {len(df)} 条数据,耗时 {(time.time()-start_time)/60:.1f} 分钟")
</code>Important Notes
Replace all placeholder values (cookies, srcid, city/region) with your own credentials obtained after logging into Beike.
The scraper relies on the current HTML structure; any changes to class names or page layout will require code updates.
Adjust time.sleep intervals to balance speed and anti‑scraping risk.
Python Programming Learning Circle
A global community of Chinese Python developers offering technical articles, columns, original video tutorials, and problem sets. Topics include web full‑stack development, web scraping, data analysis, natural language processing, image processing, machine learning, automated testing, DevOps automation, and big data.
How this landed with the community
Was this worth your time?
0 Comments
Thoughtful readers leave field notes, pushback, and hard-won operational detail here.