Files
Obsidian/Extras/Omnivore/数据处理过程.md

7.8 KiB
Raw Blame History

aliases, atlas, created, modified, tags, title
aliases atlas created modified tags title
Atlas/Card 2024-03-15 21:36:51 2024-03-15 21:55:16 数据处理过程

数据清洗

通过编写 Python 代码将对接人所提供的所有业务明细单整合为一个文件以便后续数据分析的进行

import os
import pandas as pd

# Directory holding the per-period Excel workbooks to merge
directory = 'E:/Projects/analyse'

# Name of the merged workbook written at the end (relative to the CWD)
output_name = 'merged_data.xlsx'

# One DataFrame per source workbook
all_data = []

# Sort the listing: os.listdir() returns entries in arbitrary order, and the
# row order of the merged sheet should be reproducible between runs.
for filename in sorted(os.listdir(directory)):
    # Accept both extensions regardless of case (e.g. 'REPORT.XLSX')
    if not filename.lower().endswith(('.xlsx', '.xls')):
        continue
    # '~$name.xlsx' is the lock file Excel creates while a workbook is open;
    # it matches the extension filter but is not a readable workbook.
    if filename.startswith('~$'):
        continue
    # If the script is run from inside `directory`, a previous run's output
    # would otherwise be merged back in and duplicate every row.
    if filename == output_name:
        continue
    file_path = os.path.join(directory, filename)
    all_data.append(pd.read_excel(file_path))

# pd.concat([]) raises "No objects to concatenate"; fail with a clearer message
if not all_data:
    raise SystemExit(f"No Excel files found in {directory!r}")

# Stack all workbooks into a single DataFrame with a fresh 0..n-1 index
merged_data = pd.concat(all_data, ignore_index=True)

# Save the merged DataFrame to a new Excel file
merged_data.to_excel(output_name, index=False)

print("Files have been merged and saved as 'merged_data.xlsx'")

删去了以下列:序号、服务单号、调度单号、联系人、联系电话、患者信息、销售、介绍人、客服、调度、来源、承包组、车牌、出车成员、医护出车和任务备注

经过确认,所有调度单状态不为已返回的订单均未产生收入,故将其全部筛选出来后将总成交价一列的数值改为 0 以免影响计算结果,统计后月营收额如下所示:

日期 2022-04 2022-05 2022-06 2022-07 2022-08 2022-09 2022-10 2022-11 2022-12 2023-01 2023-02 2023-03 2023-04 2023-05 2023-06 2023-07 2023-08 2023-09 2023-10 2023-11 2023-12 2024-01 2024-02
营收额 3328917.00 3362286.00 3973152.00 3462363.00 4250864.00 4144810.76 4360712.00 4587020.00 4880988.50 4197830.00 3309294.00 3338335.00 4069565.00 4292058.60 3101339.20 3834394.40 3114722.80 2750602.00 4161377.40 3465051.00 2898861.00 3426260.50 3559553.15

数据分析

月营收趋势

import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']

# Read the merged order sheet
data = pd.read_excel('E:/Projects/analyse/pythonProject/merged_data.xlsx')

# Parse the order date and force the deal price to a number
# (cells that cannot be parsed as numbers become NaN and are skipped by sum()).
data = data.assign(**{
    '日期': pd.to_datetime(data['日期']),
    '总成交价': pd.to_numeric(data['总成交价'], errors='coerce'),
})

# Monthly period key used as the grouping bucket
data['YearMonth'] = data['日期'].dt.to_period('M')

# Total revenue per month, flattened back into a two-column frame
revenue_by_month = data.groupby('YearMonth')['总成交价'].sum()
monthly_revenue = revenue_by_month.reset_index()

# Periods are converted to text so matplotlib treats them as categories
month_labels = monthly_revenue['YearMonth'].astype(str)

plt.figure(figsize=(14, 7))
plt.plot(month_labels, monthly_revenue['总成交价'], marker='o')
plt.xlabel('月份')
plt.ylabel('收入')
plt.title('月营收趋势')
plt.xticks(rotation=45)
plt.grid(visible=True)
plt.tight_layout()
plt.show()

image.png|600

平均客单价

为避免极端值影响,先按月份将所有数据分组,剔除前 1%和后 1%的订单后再计算平均客单价

# Average order value per month: load and prepare the data for trimming
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']

# Load the Excel file
data = pd.read_excel('E:/Projects/analyse/pythonProject/merged_data.xlsx')

# Ensure '日期' is in datetime format for grouping
data['日期'] = pd.to_datetime(data['日期'])

# Coerce the deal price to a number so that a stray non-numeric cell becomes
# NaN instead of breaking the quantile()/mean() calls further down.
data['总成交价'] = pd.to_numeric(data['总成交价'], errors='coerce')

# Add a 'YearMonth' column for easier analysis
data['YearMonth'] = data['日期'].dt.to_period('M')

# Group data by 'YearMonth'
grouped = data.groupby('YearMonth')
  
  
# Function to trim the low and high tails of one group
def remove_outliers(group, column='总成交价', lower_q=0.01, upper_q=0.99):
    """Return the rows of *group* whose *column* value lies strictly between two quantiles.

    Args:
        group: DataFrame holding the rows of one group.
        column: Name of the numeric column to trim on.
        lower_q: Quantile (0..1) at or below which rows are dropped.
        upper_q: Quantile (0..1) at or above which rows are dropped.

    Returns:
        A filtered DataFrame that keeps the original index labels.

    Raises:
        ValueError: If the quantiles are not ordered within [0, 1].

    Both comparisons are strict, so rows equal to a threshold are removed,
    as are rows whose value is NaN. A group whose values are all identical
    therefore comes back empty.
    """
    if not 0 <= lower_q <= upper_q <= 1:
        raise ValueError(
            f"quantiles must satisfy 0 <= lower_q <= upper_q <= 1, "
            f"got lower_q={lower_q!r}, upper_q={upper_q!r}"
        )
    values = group[column]
    lower = values.quantile(lower_q)
    upper = values.quantile(upper_q)
    return group[(values > lower) & (values < upper)]
  
  
# Trim every month separately and stack the survivors back together.
# Iterating the groupby hands each group over with its 'YearMonth' column
# intact; DataFrameGroupBy.apply() operating on the grouping column is
# deprecated since pandas 2.2, and without that column the second groupby
# below would fail. ignore_index=True gives the same flat 0..n-1 index that
# reset_index(drop=True) produced.
filtered_groups = pd.concat(
    [remove_outliers(month_rows) for _, month_rows in grouped],
    ignore_index=True,
)

# Group by 'YearMonth' again after filtering and calculate the average price
average_price_filtered = filtered_groups.groupby('YearMonth')['总成交价'].mean().reset_index()

# Convert 'YearMonth' to string for plotting
average_price_filtered['YearMonth'] = average_price_filtered['YearMonth'].astype(str)

# Plotting the result
plt.figure(figsize=(14, 7))
plt.plot(average_price_filtered['YearMonth'], average_price_filtered['总成交价'], marker='o', linestyle='-',
         color='red')
plt.title('平均客单价每月数据去除前1%和后1%')
plt.xlabel('月份')
plt.ylabel('价格')
plt.xticks(rotation=45)
plt.grid(visible=True)
plt.tight_layout()
plt.show()

image.png|600

业务时段分布

查看一天二十四小时中的业务分布情况

import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']

# Load the Excel file
data = pd.read_excel('E:/Projects/analyse/pythonProject/merged_data.xlsx')

# Extract the digits in front of the first ':' of '时间' as the hour.
# (presumably the column holds text such as 'HH:MM' — confirm against the data)
# The pattern is a raw string: '\d' in a normal string literal is an invalid
# escape sequence and triggers a SyntaxWarning on recent Python versions.
# expand=False returns a Series rather than a one-column DataFrame.
hour_text = data['时间'].str.extract(r'(\d+):', expand=False)

# Rows without a parseable time cannot be assigned to an hour; drop them
# instead of letting astype(int) fail on NaN.
has_hour = hour_text.notna()
data = data.loc[has_hour].assign(Hour=hour_text[has_hour].astype(int))

# Number of orders per hour (count of non-null '日期' values)
service_demand_by_hour = data.groupby('Hour')['日期'].count().reset_index()

# Plotting service demand by hour
plt.figure(figsize=(12, 6))
plt.bar(service_demand_by_hour['Hour'], service_demand_by_hour['日期'], color='orange')
plt.title('业务时段分布')
plt.xlabel('24 小时')
plt.ylabel('业务频次')
plt.xticks(range(0, 24))
plt.grid(axis='y')
plt.show()

image.png|600

白班夜班比例

import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']

# Load the Excel file
data = pd.read_excel('E:/Projects/analyse/pythonProject/merged_data.xlsx')

# Ensure '日期' is in datetime format for grouping
data['日期'] = pd.to_datetime(data['日期'])

# Add a 'YearMonth' column for easier analysis
data['YearMonth'] = data['日期'].dt.to_period('M')

# Number of orders per shift, most frequent first
shift_ratio = data['班次'].value_counts()

# Fix one colour per shift label. value_counts() orders by frequency while the
# unstacked monthly table orders its columns by label, so passing the same
# positional colour list to both charts could paint one shift in two colours.
palette = ['lightblue', 'lightgreen']
shift_colors = {
    shift: palette[position % len(palette)]
    for position, shift in enumerate(shift_ratio.index)
}

# Pie chart: overall share of each shift
plt.figure(figsize=(8, 8))
plt.pie(shift_ratio, labels=shift_ratio.index, autopct='%1.1f%%', startangle=140,
        colors=[shift_colors[shift] for shift in shift_ratio.index])
plt.title('白班和夜班的比例')
plt.show()

# Orders per month and shift; month/shift combinations with no rows become 0
shift_volume_by_month = data.groupby(['YearMonth', '班次'])['日期'].count().unstack(fill_value=0)

# Stacked bar chart: monthly volume per shift, coloured like the pie chart
shift_volume_by_month.plot(kind='bar', stacked=True, figsize=(14, 7),
                           color=[shift_colors[shift] for shift in shift_volume_by_month.columns])
plt.title('Volume of Day and Night Shifts by Month')
plt.xlabel('Year-Month')
plt.ylabel('Number of Shifts')
plt.xticks(rotation=45)
plt.legend(title='Shift')
plt.tight_layout()
plt.show()

image.png|600

image.png|600