0%

pandas实现SQL

LeetCode第1645题为例。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
WITH recursive months AS (
SELECT
1 AS month
UNION ALL
SELECT
month + 1
FROM
months
WHERE
month < 12
)
SELECT
m.month AS month,
ifnull(
round(
(
count(
DISTINCT(t.driver_id)
) / (
SELECT
count(driver_id)
FROM
drivers
WHERE
join_date < date_add(
'2020-1-1', INTERVAL m.month month
)
) * 100
),
2
),
0.00
) working_percentage
FROM
(
SELECT
r.ride_id,
r.requested_at requested_date,
ar.ride_id accepted_ride,
ar.driver_id
FROM
rides r
RIGHT JOIN acceptedrides ar ON r.ride_id = ar.ride_id
WHERE
r.requested_at < '2021-01-01'
AND r.requested_at > '2019-12-31'
ORDER BY
r.requested_at
) t
RIGHT JOIN months m ON m.month = month(t.requested_date)
GROUP BY
m.month;

Result table:

month working_percentage
1 0.00
2 0.00
3 25.00
4 0.00
5 0.00
6 20.00
7 20.00
8 20.00
9 0.00
10 0.00
11 33.33
12 16.67

SQL to CSV

jupyter notebook

1
import pandas as pd
1
rides = pd.read_csv('Rides.csv')
1
accepted_rides = pd.read_csv('AcceptedRides.csv')
1
drivers = pd.read_csv('Drivers.csv')
1
drivers

driver_id join_date
0 10 2019-12-10
1 8 2020-1-13
2 5 2020-2-16
3 7 2020-3-8
4 4 2020-5-17
5 1 2020-10-24
6 6 2021-1-5
1
rides

ride_id user_id requested_at
0 6 75 2019-12-9
1 1 54 2020-2-9
2 10 63 2020-3-4
3 19 39 2020-4-6
4 3 41 2020-6-3
5 13 52 2020-6-22
6 7 69 2020-7-16
7 17 70 2020-8-25
8 20 81 2020-11-2
9 5 57 2020-11-9
10 2 42 2020-12-9
11 11 68 2021-1-11
12 15 32 2021-1-17
13 12 11 2021-1-19
14 14 18 2021-1-27
1
accepted_rides

ride_id driver_id ride_distance ride_duration
0 10 10 63 38
1 13 10 73 96
2 7 8 100 28
3 17 7 119 68
4 20 1 121 92
5 5 7 42 101
6 2 4 6 38
7 11 8 37 43
8 15 8 108 82
9 12 8 38 34
10 14 1 90 74

先按SQL的逻辑,用pandas做一下:

1.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
    SELECT 
r.ride_id,
r.requested_at requested_date,
ar.ride_id accepted_ride,
ar.driver_id
FROM
rides r
RIGHT JOIN acceptedrides ar ON r.ride_id = ar.ride_id
WHERE
r.requested_at < '2021-01-01'
AND r.requested_at > '2019-12-31'
ORDER BY
r.requested_at
```


```python
accepted_rides_info = pd.merge(
rides,
accepted_rides,
how="right",
on=["ride_id", "ride_id"]
)
1
accepted_rides_info

ride_id user_id requested_at driver_id ride_distance ride_duration
0 10 63 2020-3-4 10 63 38
1 13 52 2020-6-22 10 73 96
2 7 69 2020-7-16 8 100 28
3 17 70 2020-8-25 7 119 68
4 20 81 2020-11-2 1 121 92
5 5 57 2020-11-9 7 42 101
6 2 42 2020-12-9 4 6 38
7 11 68 2021-1-11 8 37 43
8 15 32 2021-1-17 8 108 82
9 12 11 2021-1-19 8 38 34
10 14 18 2021-1-27 1 90 74
1
import datetime
1
2
3
t = accepted_rides_info[(pd.to_datetime(accepted_rides_info['requested_at'])>pd.to_datetime('2019-12-31'))
& (pd.to_datetime(accepted_rides_info['requested_at'])<pd.to_datetime('2021-01-01'))
][['ride_id', 'driver_id', 'requested_at']]
1
t

ride_id driver_id requested_at
0 10 10 2020-3-4
1 13 10 2020-6-22
2 7 8 2020-7-16
3 17 7 2020-8-25
4 20 1 2020-11-2
5 5 7 2020-11-9
6 2 4 2020-12-9
1
requested_months = pd.DatetimeIndex(t['requested_at']).month
1
t['month'] = requested_months
1
t

ride_id driver_id requested_at month
0 10 10 2020-3-4 3
1 13 10 2020-6-22 6
2 7 8 2020-7-16 7
3 17 7 2020-8-25 8
4 20 1 2020-11-2 11
5 5 7 2020-11-9 11
6 2 4 2020-12-9 12
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
def get_percent(m):
# select count(distinct driver_id) where month=m from t;
a = t[t.month==m].agg(dict(driver_id=pd.Series.nunique))
if len(a) == 0:
a = 0
else:
a = a[0]
if m < 12:
# SELECT count(driver_id) FROM drivers WHERE join_date < 2020-m+1-1:
d = drivers[pd.to_datetime(drivers['join_date']) <
pd.to_datetime('2020-'+str(m+1)+'-1')].agg(dict(driver_id=pd.Series.nunique))[0]
else:
# SELECT count(driver_id) FROM drivers WHERE join_date < 2020-m+1-1:
d = drivers[pd.to_datetime(drivers['join_date']) <
pd.to_datetime('2021-1-1')].agg(dict(driver_id=pd.Series.nunique))[0]

return a/d
1
get_percent(3)
0.25
1
2
3
4
5
6
7
months = []
percentage = []
for i in range(1, 13):
months.append(i)
percentage.append(get_percent(i))

result = pd.DataFrame({'month': months, 'percentage': percentage})
1
result

month percentage
0 1 0.000000
1 2 0.000000
2 3 0.250000
3 4 0.000000
4 5 0.000000
5 6 0.200000
6 7 0.200000
7 8 0.200000
8 9 0.000000
9 10 0.000000
10 11 0.333333
11 12 0.166667