
Python Data‑Analysis Project: Scraping, Cleaning, Visualizing and Word‑Clouding Rental‑Apartment Complaints

This tutorial demonstrates a complete Python data‑analysis workflow that scrapes complaint records for a rental‑apartment company, cleans and merges the data, visualizes complaint trends, and generates word‑clouds to reveal common issues and user demands.

The article presents a step‑by‑step data‑analysis case study that starts with web‑scraping complaint data from the Black Cat (黑猫) platform for a long‑term rental apartment brand, proceeds to data cleaning and merging, visualizes temporal complaint patterns, and finally creates word‑clouds to highlight frequent complaint topics and user demands.

1. Data Scraping

<code>import requests, time
import pandas as pd
import numpy as np
requests.packages.urllib3.disable_warnings()  # suppress HTTPS warnings
from fake_useragent import UserAgent  # generate random User‑Agent header

def request_data_uid(req_s, couid, page, total_page):
    params = {
        'couid': couid,  # merchant ID
        'type': '1',
        'page_size': page * 10,  # request size used by the original scraper (10 × page number)
        'page': page,
    }
    print(f"正在爬取第{page}页,共计{total_page}页,剩余{total_page-page}页")
    url = 'https://tousu.sina.com.cn/api/company/received_complaints'
    header = {'user-agent': UserAgent().random}
    res = req_s.get(url, headers=header, params=params, verify=False)
    info_list = res.json()['result']['data']['complaints']
    result = []
    for info in info_list:
        _data = info['main']
        timestamp = float(_data['timestamp'])
        date = time.strftime("%Y-%m-%d", time.localtime(timestamp))
        data = [date, _data['sn'], _data['title'], _data['appeal'], _data['summary']]
        result.append(data)
    pd_result = pd.DataFrame(result, columns=["投诉日期", "投诉编号", "投诉问题", "投诉诉求", "详细说明"])
    return pd_result

def request_data_keywords(req_s, keyword, page, total_page):
    params = {
        'keywords': keyword,  # search keyword
        'type': '1',
        'page_size': page * 10,
        'page': page,
    }
    print(f"正在爬取第{page}页,共计{total_page}页,剩余{total_page-page}页")
    url = 'https://tousu.sina.com.cn/api/index/s?'
    header = {'user-agent': UserAgent().random}
    res = req_s.get(url, headers=header, params=params, verify=False)
    info_list = res.json()['result']['data']['lists']
    result = []
    for info in info_list:
        _data = info['main']
        timestamp = float(_data['timestamp'])
        date = time.strftime("%Y-%m-%d", time.localtime(timestamp))
        data = [date, _data['sn'], _data['title'], _data['appeal'], _data['summary']]
        result.append(data)
    pd_result = pd.DataFrame(result, columns=["投诉日期", "投诉编号", "投诉问题", "投诉诉求", "详细说明"])
    return pd_result

req_s = requests.Session()

# Scrape data for the brand by merchant UID
frames = []
total_page = 2507
for page in range(1, total_page + 1):
    data = request_data_uid(req_s, '5350527288', page, total_page)
    frames.append(data)
result = pd.concat(frames, ignore_index=True)  # DataFrame.append was removed in pandas 2.0
result['投诉对象'] = "某壳公寓"
result.to_csv("某壳公寓投诉数据.csv", index=False)

# Scrape keyword-based data for the related company name
frames = []
total_page = 56
for page in range(1, total_page + 1):
    data = request_data_keywords(req_s, '某梧桐', page, total_page)
    frames.append(data)
result = pd.concat(frames, ignore_index=True)
result['投诉对象'] = "某梧桐"
result.to_csv("某梧桐投诉数据.csv", index=False)</code>

2. Data Cleaning and Merging

<code>import os, re
import pandas as pd
import numpy as np

# Clean keyword‑scraped complaint titles
data_path = os.path.join('data', '某梧桐投诉数据.csv')
data = pd.read_csv(data_path)
pattern = r'[^\u4e00-\u9fa5\d]'  # match anything that is not a Chinese character or a digit
data['投诉问题'] = data['投诉问题'].apply(lambda x: re.sub(pattern, '', x))
data.to_csv(data_path, index=False, encoding='utf_8_sig')

# Merge all CSV files in the data folder
frames = []
for wj in os.listdir('data'):
    data_path = os.path.join('data', wj)
    frames.append(pd.read_csv(data_path))
result = pd.concat(frames, ignore_index=True)  # DataFrame.append was removed in pandas 2.0
result.to_csv('data/合并后某壳投诉数据.csv', index=False, encoding='utf_8_sig')

# Filter to a specific date range for analysis
data = pd.read_csv('data/合并后某壳投诉数据.csv')
data = data[data.投诉日期 <= '2020-11-09']
print(f"截至2020-11-09之前,黑猫投诉累计收到某壳公寓相关投诉共计 {len(data)} 条")</code>

3. Temporal Distribution Analysis and Plotting

<code># Group by date and count complaints
_data = data.groupby('投诉日期').count().reset_index()[['投诉日期', '投诉编号']]
_data.rename(columns={"投诉编号": "投诉数量"}, inplace=True)

# Summarize the different periods
num1 = _data[_data.投诉日期 <= '2020-01-30'].投诉数量.sum()
data0 = pd.DataFrame([['2020-01-30之前', num1]], columns=['投诉日期', '投诉数量'])

# Keep February 2020 day by day to show the surge
data1 = _data[(_data.投诉日期 >= '2020-02-01') & (_data.投诉日期 <= '2020-02-21')]

# Collapse the quieter stretch between the surge and early November into one bar
num2 = _data[(_data.投诉日期 > '2020-02-21') & (_data.投诉日期 <= '2020-11-05')].投诉数量.sum()
data3 = pd.DataFrame([['2020-02-21 ~ 2020-11-05', num2]], columns=['投诉日期', '投诉数量'])

# Keep the final days (including the 2020-11-06 spike) day by day
data2 = _data[(_data.投诉日期 >= '2020-11-06') & (_data.投诉日期 <= '2020-11-09')]

new_data = pd.concat([data0, data1, data3, data2])

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['font.size'] = 18
plt.rcParams['figure.figsize'] = (12, 8)
plt.style.use('ggplot')
new_data.set_index('投诉日期').plot(kind='bar')  # bar chart of complaint counts</code>

The analysis shows a modest baseline before 2020‑01‑30, a sharp increase in February 2020 (likely related to the pandemic and rental‑subsidy issues), a relatively stable period until early November, and an outlier spike on 2020‑11‑06 caused by a legal execution event.
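To verify which individual days drive those spikes, the daily counts can be sorted directly (a quick sanity check rather than a step from the original write-up):

<code># List the ten busiest complaint days
print(_data.sort_values('投诉数量', ascending=False).head(10))</code>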

4. Word‑Cloud Generation

<code>import jieba  # Chinese word segmentation
import re, collections
from wordcloud import WordCloud
import PIL.Image as img

# Build a single string of all complaint details
all_word = ''
for line in data.values:
    word = line[4]  # detailed description column
    all_word += word
result = list(jieba.cut(all_word))
wordcloud = WordCloud(width=800, height=600, background_color='white',
                      font_path='C:\\Windows\\Fonts\\msyh.ttc',
                      max_font_size=500, min_font_size=20).generate(' '.join(result))
wordcloud.to_file('某壳公寓投诉详情.png')

# Repeat for complaint titles (column index 2) and appeals (column index 3)
for idx, out_file in [(2, '某壳公寓投诉问题.png'), (3, '某壳公寓投诉诉求.png')]:
    all_word = ''
    for line in data.values:
        all_word += line[idx]
    result = list(jieba.cut(all_word))
    wc = WordCloud(width=800, height=600, background_color='white',
                   font_path='C:\\Windows\\Fonts\\msyh.ttc',
                   max_font_size=500, min_font_size=20).generate(' '.join(result))
    wc.to_file(out_file)
</code>
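Beyond the images, it can help to inspect the raw term frequencies behind the detail word-cloud. A small sketch using collections.Counter (dropping single-character tokens is an assumed heuristic, not something specified in the original):

<code>import collections
import jieba

# Count segmented terms in the detailed descriptions, ignoring single-character tokens
detail_text = ''.join(str(x) for x in data['详细说明'])
tokens = [w for w in jieba.cut(detail_text) if len(w.strip()) > 1]
counter = collections.Counter(tokens)
for word, freq in counter.most_common(20):
    print(word, freq)</code>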

The resulting word‑clouds highlight that the most frequent complaint topics are cash‑withdrawal issues, delayed promotional rebates, unreachable customer service, and cleaning problems, while the dominant user demand is a refund or compensation for the alleged misconduct of the rental‑apartment operator.

Python · Data Cleaning · Data Visualization · Web Scraping · pandas · Word Cloud · Complaint Analysis
Written by

Python Programming Learning Circle

A global community of Chinese Python developers offering technical articles, columns, original video tutorials, and problem sets. Topics include web full‑stack development, web scraping, data analysis, natural language processing, image processing, machine learning, automated testing, DevOps automation, and big data.
