In [31]:
from pathlib import Path
import pandas as pd
from datetime import datetime as dt
import json
from colorama import Fore, Style, init
In [32]:
filename = Path('html/20240322.html')
In [33]:
with open(filename, 'r', encoding='utf-8') as f:
html = f.read()
In [34]:
# import requests
# from lxml import etree
# import random
# import os
In [35]:
# class CodeSpider(object):
#     def __init__(self):
#         self.url = 'https://m.anjuke.com/sh/trendency/'
#         self.ua_list = [
#             'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
#             'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
#             'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)',
#         ]
#
#     def parse_html(self):
#         # Fetch the page with a randomly chosen User-Agent
#         html = requests.get(url=self.url, headers={'User-Agent': random.choice(self.ua_list)})
#         html = html.content.decode('utf-8', 'ignore')
#         # parse_html = etree.HTML(html)
#         return html
In [36]:
# spider = CodeSpider()
# html = spider.parse_html()
In [37]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')
In [38]:
# print(soup.prettify())
In [39]:
rate_items = soup.find(id="__next").find("div", "container").find("div", "content").find("div", "rate").find("div", "divide-list").find_all("a", "rate-item")
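The chained find calls above can also be written as a single CSS selector; the sketch below is an assumption that the page keeps the same id and class names, and binds the result to a hypothetical rate_items_alt.
In [ ]:
# Sketch: equivalent extraction with one CSS selector (assumes the same id/class names)
rate_items_alt = soup.select("#__next div.container div.content div.rate div.divide-list a.rate-item")
assert len(rate_items_alt) == len(rate_items)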
In [40]:
price_list = []
In [41]:
for item in rate_items:
    price = (item.find("div", "name").text,
             item.find("div", "price").find("span", "price-num").text,
             item.find("div", "price").find("span", "price-unit").text)
    price_list.append(price)
    print(price)
In [42]:
df = pd.DataFrame(price_list, columns=['district', 'price', 'unit'])
In [43]:
df.to_csv(f"data/house_price_{filename.stem}.csv", index=False, header=True)
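If the prices are needed as numbers for later analysis, the string column can be coerced after the raw CSV is written; a minimal sketch, assuming the scraped values are plain digit strings (the price_num column name is illustrative).
In [ ]:
# Sketch: numeric view of the price column (assumption: digit strings, possibly with
# thousands separators; anything unparseable becomes NaN)
df['price_num'] = pd.to_numeric(df['price'].str.replace(',', ''), errors='coerce')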
In [44]:
prices = []
for dist in ['嘉定', '青浦', '松江', '宝山', '闵行']:
    for price in price_list:
        # Match the district keyword against the scraped name
        if dist in price[0]:
            prices.append(price[1])
            break
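An alternative to the nested loop is a one-time name-to-price mapping; a sketch only, assuming the scraped names are unique and match the keywords exactly.
In [ ]:
# Sketch: dict lookup instead of the nested loop (assumes names equal the keywords)
price_by_district = {name: num for name, num, unit in price_list}
# prices = [price_by_district[d] for d in ['嘉定', '青浦', '松江', '宝山', '闵行']]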
In [45]:
print("\t".join(prices))
In [ ]: