本文共 5711 字,大约阅读时间需要 19 分钟。
首先,我们引入并加载所需的数据集。
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the calendar table from the Madrid Airbnb data directory.
calendar = pd.read_csv("./data/madrid-airbnb-data/calendar.csv")

# The price columns are read as text; strip the "$" and "," characters so the
# remaining text can be cast to float32.
calendar['price'] = (
    calendar['price'].str.replace(r"[$,]", "", regex=True).astype(np.float32)
)
calendar['adjusted_price'] = (
    calendar['adjusted_price']
    .str.replace(r"[$,]", "", regex=True)
    .astype(np.float32)
)

# Parse the date column so the .dt accessors can be used on it.
calendar['date'] = pd.to_datetime(calendar['date'])

# Next, the analysis proceeds through the following steps:
# Derive the grouping keys from the parsed date column.
calendar['weekday'] = calendar['date'].dt.weekday
calendar['month'] = calendar['date'].dt.month

# Mean price per month. x/y are passed by keyword because seaborn >= 0.12 no
# longer accepts them positionally.
month_price = calendar.groupby("month")['price'].mean()
plt.figure()  # separate figure so the two bar charts do not overlap
sns.barplot(x=month_price.index, y=month_price.values)

# Mean price per weekday.
weekday_price = calendar.groupby("weekday")['price'].mean()
plt.figure()
sns.barplot(x=weekday_price.index, y=weekday_price.values)

# The listings data is processed as follows:
# Load the detailed listings table.
listings_detailed = pd.read_csv("./data/madrid-airbnb-data/listings_detailed.csv")

# Strip "$" and "," from the money columns and cast them to float32.
listings_detailed['price'] = (
    listings_detailed['price']
    .str.replace(r"[$,]", "", regex=True)
    .astype(np.float32)
)
# Missing cleaning fees are treated as 0. The result is assigned back rather
# than using fillna(inplace=True) on a column selection, which is chained
# assignment and is not guaranteed to modify the frame.
listings_detailed['cleaning_fee'] = (
    listings_detailed['cleaning_fee']
    .str.replace(r"[$,]", "", regex=True)
    .astype(np.float32)
    .fillna(0)
)

# Minimum spend for a stay: (nightly price + cleaning fee) * minimum nights,
# exactly as the original expression computes it.
listings_detailed['minimum_cost'] = (
    (listings_detailed['price'] + listings_detailed['cleaning_fee'])
    * listings_detailed['minimum_nights']
)

# Number of amenities: drop the first and last character of the text and count
# the comma-separated entries. An empty body (e.g. "{}") counts as 0 instead of
# the 1 that splitting an empty string would give.
amenities_body = listings_detailed['amenities'].str[1:-1]
listings_detailed['n_amenities'] = (
    amenities_body.str.split(",").str.len().where(amenities_body != "", 0)
)

# Bucket capacity into left-closed intervals (right=False):
# [1, 2) Single, [2, 3) Couple, [3, 5) Family, [5, 100) Group.
listings_detailed['accommodates_type'] = pd.cut(
    listings_detailed['accommodates'],
    bins=[1, 2, 3, 5, 100],
    include_lowest=True,
    right=False,
    labels=['Single', 'Couple', 'Family', 'Group'],
)

# Room types and neighbourhoods can then be analysed as follows:
# Number of listings per room type, shown as a pie chart and a bar chart
# side by side.
room_type_counts = listings_detailed['room_type'].value_counts()

# plt.subplots returns (figure, axes); unpack both so axes[0] / axes[1] are the
# two Axes objects rather than elements of the returned tuple.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
axes[0].pie(
    room_type_counts.values,
    autopct="%.2f%%",
    labels=room_type_counts.index,
)
# x/y by keyword (required by seaborn >= 0.12), drawn on the right-hand axes.
sns.barplot(x=room_type_counts.index, y=room_type_counts.values, ax=axes[1])
plt.tight_layout()

# Number of listings per neighbourhood group, as a horizontal bar chart on its
# own figure.
neighbourhood_counts = listings_detailed['neighbourhood_group_cleansed'].value_counts()
plt.figure()
sns.barplot(y=neighbourhood_counts.index, x=neighbourhood_counts.values, orient='h')
对于房东的房源数量,我们可以统计其分布并进行分箱分析:
# Number of listings owned by each host.
host_number = listings_detailed.groupby('host_id').size()

# Distribution of hosts with fewer than 10 listings.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11 in favour of
# histplot/displot; kept here so the chart stays the same on older versions.
plt.figure()
sns.distplot(host_number[host_number < 10])

# Bucket hosts by listing count into left-closed intervals (right=False):
# [1, 2) -> '1', [2, 3) -> '2', [3, 5) -> '3-4', [5, 1000) -> '5+'.
host_number_bins = pd.cut(
    host_number,
    bins=[1, 2, 3, 5, 1000],
    include_lowest=True,
    right=False,
    labels=['1', '2', '3-4', '5+'],
)

# The pie labels must come from the value_counts index (one label per wedge).
# host_number_bins.index is the host_id index, with one entry per host.
host_bin_counts = host_number_bins.value_counts()
plt.figure()
plt.pie(host_bin_counts, autopct="%.2f%%", labels=host_bin_counts.index)

# The review data can be analysed with the following steps:
# Load the reviews table, parsing the date column while reading.
reviews = pd.read_csv(
    "./data/madrid-airbnb-data/reviews_detailed.csv",
    parse_dates=['date'],
)
reviews['year'] = reviews['date'].dt.year
reviews['month'] = reviews['date'].dt.month

# Reviews per year. x/y are passed by keyword because seaborn >= 0.12 no longer
# accepts them positionally; each chart gets its own figure.
n_reviews_year = reviews.groupby("year").size()
plt.figure()
sns.barplot(x=n_reviews_year.index, y=n_reviews_year.values)

# Reviews per calendar month, summed over all years.
n_reviews_month = reviews.groupby("month").size()
plt.figure()
sns.barplot(x=n_reviews_month.index, y=n_reviews_month.values)

# Year x month table of review counts; combinations with no reviews become 0.
year_month_reviews = (
    reviews.groupby(['year', 'month']).size().unstack("month").fillna(0)
)

# For listing price prediction, the following approach is used:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Columns used for modelling. 'price' (the target) and
# 'is_business_travel_ready' are selected here because both are referenced
# below; without them the later lookups raise KeyError.
ml_columns = [
    'host_is_superhost',
    'host_identity_verified',
    'neighbourhood_group_cleansed',
    'latitude',
    'longitude',
    'property_type',
    'room_type',
    'accommodates',
    'bathrooms',
    'bedrooms',
    'cleaning_fee',
    'minimum_nights',
    'maximum_nights',
    'availability_90',
    'number_of_reviews',
    'is_business_travel_ready',
    'price',
]

# Keep listings priced below 300 and drop rows with any missing value.
# dropna returns a new frame, so nothing is modified in place on a slice of
# listings_detailed.
ml_listings = listings_detailed.loc[
    listings_detailed['price'] < 300, ml_columns
].dropna(axis=0)

features = ml_listings.drop(columns=['price'])
targets = ml_listings['price']

# Categorical columns are one-hot encoded.
disperse_columns = [
    'host_is_superhost',
    'host_identity_verified',
    'neighbourhood_group_cleansed',
    'property_type',
    'room_type',
    'is_business_travel_ready',
]
disperse_features = pd.get_dummies(features[disperse_columns])

# The remaining columns are standardised. `columns=` is required here: a bare
# positional argument to DataFrame.drop is interpreted as row labels.
continuouse_features = features.drop(columns=disperse_columns)
scaler = StandardScaler()
# NOTE(review): the scaler is fit on every row, including rows that any later
# train/test split assigns to the test set.
continuouse_features = scaler.fit_transform(continuouse_features)

# Final design matrix: one-hot columns followed by the scaled numeric columns.
feature_array = np.hstack([disperse_features, continuouse_features])
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Hold out 25% of the rows for evaluation. The seeds make the split and the
# forest reproducible, so the printed metrics are stable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    feature_array, targets, test_size=0.25, random_state=42
)

regressor = RandomForestRegressor(n_estimators=100, random_state=42)
regressor.fit(X_train, y_train)
y_predict = regressor.predict(X_test)

# Report mean absolute error and R^2 on the held-out rows.
print("平均误差:", mean_absolute_error(y_test, y_predict))
print("R²评分:", r2_score(y_test, y_predict))

# For review-count prediction, the following approach is used:
# Review counts per (year, month). groupby sorts its keys, so the rows are in
# chronological order.
ym_reviews = (
    reviews.groupby(['year', 'month'])
    .size()
    .reset_index()
    .rename(columns={0: "count"})
)
features = ym_reviews[['year', 'month']]
targets = ym_reviews['count']

# Evaluate on the last three observed months, which have real counts to compare
# against. The original compared the forecasts with [2019, 10], which is a
# feature row rather than ground truth and has a different length.
n_holdout = 3
eval_regressor = RandomForestRegressor(n_estimators=100)
eval_regressor.fit(features.iloc[:-n_holdout], targets.iloc[:-n_holdout])
holdout_predict = eval_regressor.predict(features.iloc[-n_holdout:])
print("平均误差:", mean_absolute_error(targets.iloc[-n_holdout:], holdout_predict))
print("R²评分:", r2_score(targets.iloc[-n_holdout:], holdout_predict))

# Refit on all observed months, then forecast October-December 2019. The query
# is a DataFrame with the same column names the model was fitted on.
regressor = RandomForestRegressor(n_estimators=100)
regressor.fit(features, targets)
future_months = pd.DataFrame(
    [[2019, 10], [2019, 11], [2019, 12]],
    columns=['year', 'month'],
)
y_predict = regressor.predict(future_months)

# Finally, the predictions can be visualised as follows:
from pandas import DataFrame

# Turn the forecasts into rows shaped like ym_reviews; the i-th prediction is
# assigned to month 10 + i of 2019.
predict_reviews = DataFrame(
    [[2019, 10 + index, x] for index, x in enumerate(y_predict)],
    columns=['year', 'month', 'count'],
)
final_reviews = pd.concat([ym_reviews, predict_reviews]).reset_index()

# One line per year, month on the x axis.
years = final_reviews['year'].unique()
fig, ax = plt.subplots(figsize=(10, 5))
for year in years:
    df = final_reviews[final_reviews['year'] == year]
    # Label each line with its own year so the legend does not depend on the
    # drawing order or on another DataFrame's index.
    sns.lineplot(x="month", y='count', data=df, ax=ax, label=str(year))
ax.legend()
ax.grid()
_ = ax.set_xticks(list(range(1, 13)))
通过以上分析,我们可以深入了解Airbnb数据集中的房源价格、房型分布、社区影响以及房东行为特征,从而为房源定价和运营决策提供重要参考。
转载地址:http://mzdtz.baihongyu.com/