Files
Obsidian/Extras/Omnivore/数据处理过程.md

241 lines
8.9 KiB
Markdown
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

---
aliases:
atlas: "[[Atlas/Card|Card]]"
created: 2024-03-15 21:36:51
modified: 2024-03-15 21:55:16
tags:
title: 数据处理过程
---
# 数据清洗
通过编写 Python 代码,将对接人所提供的所有业务明细单整合为一个文件,以便后续进行数据分析:
```python
import os
import pandas as pd
# Directory containing the source Excel workbooks
directory = 'E:/Projects/analyse'
# Merged workbook; a relative path is resolved against the current working directory
output_file = 'merged_data.xlsx'


def list_excel_files(directory, skip_path=None):
    """Return the paths of the Excel workbooks in *directory*, sorted by file name.

    A file qualifies when its name ends with ``.xlsx`` or ``.xls`` (compared
    case-insensitively). Names starting with ``~$`` are skipped, as is the file
    at *skip_path* when one is given.
    """
    skip = None
    if skip_path is not None:
        skip = os.path.normcase(os.path.abspath(skip_path))

    paths = []
    # Sorting makes the row order of the merged sheet reproducible between runs.
    for filename in sorted(os.listdir(directory)):
        # "~$name.xlsx" is the lock file Excel keeps beside an open workbook;
        # it is not a readable workbook.
        if filename.startswith('~$'):
            continue
        if not filename.lower().endswith(('.xlsx', '.xls')):
            continue
        file_path = os.path.join(directory, filename)
        # Never feed a previous merge result back into the merge.
        if skip is not None and os.path.normcase(os.path.abspath(file_path)) == skip:
            continue
        paths.append(file_path)
    return paths


def merge_excel_files(directory, output_path):
    """Concatenate every workbook in *directory* and write the result to *output_path*.

    Returns the merged DataFrame.

    Raises:
        FileNotFoundError: if *directory* contains no Excel workbook to merge.
    """
    file_paths = list_excel_files(directory, skip_path=output_path)
    if not file_paths:
        # pd.concat([]) would fail with a message that does not mention the directory.
        raise FileNotFoundError(
            f"No Excel files (.xlsx/.xls) found in '{directory}'"
        )
    frames = [pd.read_excel(file_path) for file_path in file_paths]
    merged_data = pd.concat(frames, ignore_index=True)
    merged_data.to_excel(output_path, index=False)
    return merged_data


if __name__ == '__main__':
    merge_excel_files(directory, output_file)
    print("Files have been merged and saved as 'merged_data.xlsx'")
```
删去了以下列:序号、服务单号、调度单号、联系人、联系电话、患者信息、销售、介绍人、客服、调度、来源、承包组、车牌、出车成员、医护出车和任务备注
经过确认,所有调度单状态不为已返回的订单均未产生收入,故将其全部筛选出来后将总成交价一列的数值改为 0 以免影响计算结果,统计后月营收额如下所示:
| 日期 | 2022-04 | 2022-05 | 2022-06 | 2022-07 | 2022-08 | 2022-09 | 2022-10 | 2022-11 | 2022-12 | 2023-01 | 2023-02 | 2023-03 | 2023-04 | 2023-05 | 2023-06 | 2023-07 | 2023-08 | 2023-09 | 2023-10 | 2023-11 | 2023-12 | 2024-01 | 2024-02 |
| --- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- | ---------- |
| 营收额 | 3328917.00 | 3362286.00 | 3973152.00 | 3462363.00 | 4250864.00 | 4144810.76 | 4360712.00 | 4587020.00 | 4880988.50 | 4197830.00 | 3309294.00 | 3338335.00 | 4069565.00 | 4292058.60 | 3101339.20 | 3834394.40 | 3114722.80 | 2750602.00 | 4161377.40 | 3465051.00 | 2898861.00 | 3426260.50 | 3559553.15 |
# 数据分析
## 月营收趋势
```python
import pandas as pd
import matplotlib.pyplot as plt
# Route sans-serif text to Microsoft YaHei (presumably so the Chinese
# title and axis labels below render correctly).
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']

# Read the merged order sheet.
data = pd.read_excel('E:/Projects/analyse/pythonProject/merged_data.xlsx')

# Normalise column types: dates become datetimes, prices become numbers
# (anything that cannot be parsed as a number turns into NaN).
data['日期'] = pd.to_datetime(data['日期'])
data['总成交价'] = pd.to_numeric(data['总成交价'], errors='coerce')

# Tag every order with its calendar month.
data['YearMonth'] = data['日期'].dt.to_period('M')

# Total revenue per month, flattened back into a two-column frame.
revenue_per_month = data.groupby('YearMonth')['总成交价'].sum()
monthly_revenue = revenue_per_month.reset_index()
month_labels = monthly_revenue['YearMonth'].astype(str)

# Line chart of revenue over time.
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(month_labels, monthly_revenue['总成交价'], marker='o')
ax.set_title('月营收趋势')
ax.set_xlabel('月份')
ax.set_ylabel('收入')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(visible=True)
fig.tight_layout()
plt.show()
```
![image.png|600](https://image.kfdr.top/i/2024/03/16/65f5628e38f6e.png)
## 平均客单价
为避免极端值影响,先按月份将所有数据分组,在每个月内剔除总成交价最高的 1% 和最低的 1% 的订单后,再计算平均客单价:
```python
# Attempting the analysis again with additional checks
import pandas as pd
import matplotlib.pyplot as plt
# Workbook read by this analysis
DATA_PATH = 'E:/Projects/analyse/pythonProject/merged_data.xlsx'


def remove_outliers(group):
    """Return the rows of *group* whose '总成交价' lies strictly between its 1% and 99% quantiles.

    The comparison is strict on both sides, so rows whose price equals either
    quantile are dropped as well. Rows with a missing price never satisfy the
    comparison and are dropped too.
    """
    prices = group['总成交价']
    lower = prices.quantile(0.01)
    upper = prices.quantile(0.99)
    return group[(prices > lower) & (prices < upper)]


def filter_monthly_outliers(data):
    """Apply ``remove_outliers`` to each 'YearMonth' group and stack the results.

    The groups are iterated explicitly instead of going through
    ``groupby.apply``: iterating always yields complete frames, so the
    'YearMonth' column is still present in the result regardless of how the
    installed pandas version treats grouping columns inside ``apply``.
    """
    trimmed = [remove_outliers(month_rows) for _, month_rows in data.groupby('YearMonth')]
    if not trimmed:
        # No groups at all: return an empty frame with the same columns.
        return data.iloc[0:0]
    return pd.concat(trimmed, ignore_index=True)


def average_price_by_month(data):
    """Return a frame with columns 'YearMonth' (as text) and '总成交价' (trimmed monthly mean).

    *data* must provide the columns '日期' and '总成交价'; it is not modified.
    """
    data = data.copy()
    data['日期'] = pd.to_datetime(data['日期'])
    # Same coercion as the monthly-revenue analysis: a stray text cell becomes
    # NaN instead of breaking the quantile computation.
    data['总成交价'] = pd.to_numeric(data['总成交价'], errors='coerce')
    data['YearMonth'] = data['日期'].dt.to_period('M')

    filtered = filter_monthly_outliers(data)
    average = filtered.groupby('YearMonth')['总成交价'].mean().reset_index()
    # Periods are converted to text so they can be used as x-axis labels.
    average['YearMonth'] = average['YearMonth'].astype(str)
    return average


def main():
    """Load the merged workbook and plot the trimmed average order price per month."""
    plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
    data = pd.read_excel(DATA_PATH)
    average_price_filtered = average_price_by_month(data)

    plt.figure(figsize=(14, 7))
    plt.plot(average_price_filtered['YearMonth'], average_price_filtered['总成交价'],
             marker='o', linestyle='-', color='red')
    plt.title('平均客单价每月数据去除前1%和后1%')
    plt.xlabel('月份')
    plt.ylabel('价格')
    plt.xticks(rotation=45)
    plt.grid(visible=True)
    plt.tight_layout()
    plt.show()


if __name__ == '__main__':
    main()
```
![image.png|600](https://image.kfdr.top/i/2024/03/16/65f55b0a3adbf.png)
## 业务时段分布
查看一天二十四小时中的业务分布情况
```python
import pandas as pd
import matplotlib.pyplot as plt
# Workbook read by this analysis
DATA_PATH = 'E:/Projects/analyse/pythonProject/merged_data.xlsx'


def extract_hour(times):
    """Return the hour found in each entry of *times* as a numeric Series.

    The hour is the first run of digits that is followed by a colon. Entries
    without such a match (missing values, unparseable text) yield NaN.
    """
    # Values are converted to text first so that non-string cells can be
    # matched by the same pattern instead of failing in the .str accessor.
    as_text = times.astype(str)
    hours = as_text.str.extract(r'(\d+):', expand=False)
    return pd.to_numeric(hours, errors='coerce')


def service_demand_by_hour(data):
    """Count the orders of *data* per hour of day.

    Returns a frame with 24 rows and the columns 'Hour' (0-23) and '日期'
    (number of non-null '日期' values in that hour; 0 for hours without
    orders). Rows whose '时间' yields no hour in the range 0-23 are left out
    of the count.
    """
    hours = extract_hour(data['时间'])
    valid = hours.between(0, 23)  # NaN compares as False and is excluded
    hour_keys = hours[valid].astype(int)
    counts = data.loc[valid, '日期'].groupby(hour_keys).count()
    # Make every hour explicit so quiet hours appear as zero.
    counts = counts.reindex(range(24), fill_value=0)
    counts.index.name = 'Hour'
    return counts.reset_index()


def main():
    """Load the merged workbook and plot the number of orders per hour of day."""
    plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
    data = pd.read_excel(DATA_PATH)
    demand = service_demand_by_hour(data)

    plt.figure(figsize=(12, 6))
    plt.bar(demand['Hour'], demand['日期'], color='orange')
    plt.title('业务时段分布')
    plt.xlabel('24 小时')
    plt.ylabel('业务频次')
    plt.xticks(range(0, 24))
    plt.grid(axis='y')
    plt.show()


if __name__ == '__main__':
    main()
```
![image.png|600](https://image.kfdr.top/i/2024/03/16/65f56155bee8d.png)
## 白班夜班比例
```python
import pandas as pd
import matplotlib.pyplot as plt
# Workbook read by this analysis
DATA_PATH = 'E:/Projects/analyse/pythonProject/merged_data.xlsx'
# Colours handed out to the shift names, in sorted-name order
SHIFT_COLORS = ('lightblue', 'lightgreen')


def shift_palette(shifts, colors=SHIFT_COLORS):
    """Map every distinct non-null value of *shifts* to a colour.

    Names are sorted and paired with *colors* in order; the colours repeat
    when there are more names than colours. Looking colours up by name keeps
    a given shift in the same colour in every chart, whatever order a chart
    lists the shifts in.
    """
    names = sorted(shifts.dropna().unique())
    return {name: colors[i % len(colors)] for i, name in enumerate(names)}


def shift_volume_by_month(data):
    """Return the number of orders per 'YearMonth' (rows) and '班次' (columns).

    Counts non-null '日期' values; month/shift combinations without orders
    are filled with 0.
    """
    return data.groupby(['YearMonth', '班次'])['日期'].count().unstack(fill_value=0)


def main():
    """Load the merged workbook and draw the overall shift ratio and the monthly shift volume."""
    plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
    data = pd.read_excel(DATA_PATH)
    data['日期'] = pd.to_datetime(data['日期'])
    data['YearMonth'] = data['日期'].dt.to_period('M')

    palette = shift_palette(data['班次'])

    # Overall share of each shift.
    shift_ratio = data['班次'].value_counts()
    plt.figure(figsize=(8, 8))
    plt.pie(shift_ratio, labels=shift_ratio.index, autopct='%1.1f%%', startangle=140,
            colors=[palette[name] for name in shift_ratio.index])
    plt.title('白班和夜班的比例')
    plt.show()

    # Monthly volume per shift, stacked.
    shift_volume = shift_volume_by_month(data)
    shift_volume.plot(kind='bar', stacked=True, figsize=(14, 7),
                      color=[palette[name] for name in shift_volume.columns])
    plt.title('Volume of Day and Night Shifts by Month')
    plt.xlabel('Year-Month')
    plt.ylabel('Number of Shifts')
    plt.xticks(rotation=45)
    plt.legend(title='Shift')
    plt.tight_layout()
    plt.show()


if __name__ == '__main__':
    main()
```
![image.png|600](https://image.kfdr.top/i/2024/03/16/65f5617bb2942.png)
![image.png|600](https://image.kfdr.top/i/2024/03/16/65f5625b9cac2.png)
## 业务区域分布
```python
import pandas as pd
import matplotlib.pyplot as plt
# Workbook read by this analysis
DATA_PATH = 'E:/Projects/analyse/pythonProject/merged_data.xlsx'
# Collapses each '区域' value into one of the two plotted categories
REGION_CATEGORIES = {
    '市内': '省内',
    '广东省内': '省内',
    '国际': '省外',
    '港澳台': '省外',
    '广东省外': '省外',
}


def categorize_regions(regions):
    """Map each value of *regions* to its category from ``REGION_CATEGORIES``.

    Values that are not keys of the mapping become NaN, exactly as a plain
    ``Series.map`` would produce. When that happens to a non-missing value a
    ``UserWarning`` listing the unmapped values is issued, because such rows
    are later left out of ``value_counts`` and therefore of the chart.
    """
    import warnings

    categories = regions.map(REGION_CATEGORIES)
    unmapped = regions[categories.isna() & regions.notna()]
    if not unmapped.empty:
        names = sorted(set(map(str, unmapped)))
        warnings.warn(
            f"{len(unmapped)} row(s) have a region without a category "
            f"and are excluded from the distribution: {names}",
            stacklevel=2,
        )
    return categories


def main():
    """Load the merged workbook and draw the share of orders per regional category."""
    plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
    data = pd.read_excel(DATA_PATH)
    data['Regional Category'] = categorize_regions(data['区域'])
    regional_category_distribution = data['Regional Category'].value_counts()

    plt.figure(figsize=(8, 8))
    plt.pie(regional_category_distribution, labels=regional_category_distribution.index,
            autopct='%1.1f%%', startangle=140, colors=['skyblue', 'orange'])
    plt.title('业务区域分布')
    plt.show()


if __name__ == '__main__':
    main()
```
![image.png|600](https://image.kfdr.top/i/2024/03/16/65f568159bb5c.png)