0%

house_price
In [31]:
from pathlib import Path
import pandas as pd
from datetime import datetime as dt
import json
from colorama import Fore, Style, init
In [32]:
# Saved HTML snapshot to parse; its stem (the date stamp) is reused later
# when naming the exported CSV.
filename = Path('html') / '20240322.html'
In [33]:
# Load the whole saved page into memory as UTF-8 text.
html = filename.read_text(encoding='utf-8')
In [34]:
# import requests
# from lxml import etree
# import random
# import os
In [35]:
# class CodeSpider(object):
#     def __init__(self):
#         self.url = 'https://m.anjuke.com/sh/trendency/'
#         self.ua_list = [
#             'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
#             'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
#             'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .\
#             NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)', ]

#     def parse_html(self):
#         # Fetch the response content
#         html = requests.get(url=self.url, headers={'User-Agent': random.choice(self.ua_list)})
#         html = html.content.decode('utf-8', 'ignore')

# #         parse_html = etree.HTML(html)   
#         return html
In [36]:
# spider = CodeSpider()
# html = spider.parse_html()
In [37]:
from bs4 import BeautifulSoup

# Parse the loaded page with the pure-Python 'html.parser' backend.
soup = BeautifulSoup(markup=html, features='html.parser')
In [38]:
# print(soup.prettify())
In [39]:
# Walk down from the #__next root through the nested wrapper <div>s, taking
# the first match at each level, then collect every <a class="rate-item">
# under the innermost one. If any level is missing, .find() returns None and
# the next step raises AttributeError, exactly like the one-line chain would.
node = soup.find(id="__next")
for div_class in ("container", "content", "rate", "divide-list"):
    node = node.find("div", div_class)
rate_items = node.find_all("a", "rate-item")
In [40]:
# Accumulator for the (district, price, unit) tuples extracted from the page.
price_list = list()
In [41]:
# Extract one (district, price, unit) tuple of strings per rate item.
#
# The list is re-created here so this cell is idempotent: re-running it
# no longer appends a second copy of every row to an already-filled list
# (which would then be duplicated in the exported CSV).
price_list = []
for item in rate_items:
    # Look the price container up once; both the number and the unit live in it.
    price_box = item.find("div", "price")
    price = (
        item.find("div", "name").text,
        price_box.find("span", "price-num").text,
        price_box.find("span", "price-unit").text,
    )
    price_list.append(price)
    print(price)
('黄浦', '114551', '元/㎡')
('静安', '88836', '元/㎡')
('徐汇', '84283', '元/㎡')
('虹口', '82454', '元/㎡')
('长宁', '80273', '元/㎡')
('杨浦', '79078', '元/㎡')
('普陀', '69637', '元/㎡')
('浦东', '62020', '元/㎡')
('闵行', '59634', '元/㎡')
('宝山', '44509', '元/㎡')
('嘉定', '42082', '元/㎡')
('松江', '41070', '元/㎡')
('青浦', '37422', '元/㎡')
('奉贤', '25138', '元/㎡')
('崇明', '23109', '元/㎡')
('金山', '16916', '元/㎡')
('上海周边', '10241', '元/㎡')
In [42]:
# Tabulate the extracted tuples; column order matches the tuple layout.
df = pd.DataFrame(
    data=price_list,
    columns=['district', 'price', 'unit'],
)
In [43]:
# Export the table as data/house_price_<snapshot stem>.csv (header row, no index).
csv_path = Path("data") / f"house_price_{filename.stem}.csv"
# Create the output directory on first use so a fresh checkout does not fail
# with an OSError when data/ is absent.
csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(csv_path, index=False, header=True)
In [44]:
# Collect the price string of each district of interest, in this fixed order.
# A district with no matching row is skipped, so `prices` may end up shorter
# than the district list.
prices = []
for dist in ['嘉定', '青浦', '松江', '宝山', '闵行']:
    # First row whose scraped name is contained in `dist`.
    # NOTE(review): this is a substring test, not equality — confirm intended.
    first_hit = next((row for row in price_list if row[0] in dist), None)
    if first_hit is not None:
        prices.append(first_hit[1])
In [45]:
# Emit the selected prices as a single tab-separated line.
tab_separated = "\t".join(prices)
print(tab_separated)
42082	37422	41070	44509	59634
In [ ]: