5
5
import pandas as pd
6
6
from pandas .util import testing as tm
7
7
8
- for imp in [' pandas.util' , ' pandas.tools.hashing' ]:
8
+ for imp in [" pandas.util" , " pandas.tools.hashing" ]:
9
9
try :
10
10
hashing = import_module (imp )
11
11
break
15
15
16
16
class Factorize :
17
17
18
# Parameter grid: each `sort` flag is combined with each index dtype label.
# The dtype labels are used verbatim as keys into the `data` dict built in
# `setup`, so they must not carry surrounding whitespace.
params = [[True, False], ["int", "uint", "float", "string"]]
# One name per axis of `params`, matching the arguments `setup` receives.
param_names = ["sort", "dtype"]
20
20
21
21
def setup(self, sort, dtype):
    """Build the index selected by ``dtype`` and store it on ``self.idx``.

    All four candidate indexes are constructed and the one keyed by
    ``dtype`` is kept. Each candidate is repeated five times, so the
    resulting index contains duplicates. ``sort`` is accepted for signature
    compatibility with the parameter grid but is not used here.

    Raises
    ------
    KeyError
        If ``dtype`` is not one of ``"int"``, ``"uint"``, ``"float"``,
        ``"string"``.
    """
    N = 10 ** 5
    data = {
        "int": pd.Int64Index(np.arange(N).repeat(5)),
        "uint": pd.UInt64Index(np.arange(N).repeat(5)),
        "float": pd.Float64Index(np.random.randn(N).repeat(5)),
        "string": tm.makeStringIndex(N).repeat(5),
    }
    self.idx = data[dtype]
28
30
29
31
def time_factorize (self , sort , dtype ):
@@ -32,15 +34,17 @@ def time_factorize(self, sort, dtype):
32
34
33
35
class FactorizeUnique :
34
36
35
# Parameter grid: each `sort` flag is combined with each index dtype label.
# The dtype labels are used verbatim as keys into the `data` dict built in
# `setup`, so they must not carry surrounding whitespace.
params = [[True, False], ["int", "uint", "float", "string"]]
# One name per axis of `params`, matching the arguments `setup` receives.
param_names = ["sort", "dtype"]
37
39
38
40
def setup(self, sort, dtype):
    """Build the index selected by ``dtype`` and store it on ``self.idx``.

    All four candidate indexes are constructed (none of them repeated) and
    the one keyed by ``dtype`` is kept. ``sort`` is accepted for signature
    compatibility with the parameter grid but is not used here.

    Raises
    ------
    KeyError
        If ``dtype`` is not one of ``"int"``, ``"uint"``, ``"float"``,
        ``"string"``.
    AssertionError
        If the selected index turns out not to be unique.
    """
    N = 10 ** 5
    data = {
        "int": pd.Int64Index(np.arange(N)),
        "uint": pd.UInt64Index(np.arange(N)),
        "float": pd.Float64Index(np.arange(N)),
        "string": tm.makeStringIndex(N),
    }
    self.idx = data[dtype]
    # Sanity check on the fixture: this benchmark is about unique input.
    assert self.idx.is_unique
46
50
@@ -50,15 +54,17 @@ def time_factorize(self, sort, dtype):
50
54
51
55
class Duplicated :
52
56
53
# Parameter grid: each `keep` option is combined with each index dtype label.
# The dtype labels are used verbatim as keys into the `data` dict built in
# `setup`, so none of the labels may carry surrounding whitespace.
params = [["first", "last", False], ["int", "uint", "float", "string"]]
# One name per axis of `params`, matching the arguments `setup` receives.
param_names = ["keep", "dtype"]
55
59
56
60
def setup(self, keep, dtype):
    """Build the index selected by ``dtype`` and store it on ``self.idx``.

    All four candidate indexes are constructed and the one keyed by
    ``dtype`` is kept. Each candidate is repeated five times, so the
    resulting index contains duplicates. ``keep`` is accepted for signature
    compatibility with the parameter grid but is not used here.

    Raises
    ------
    KeyError
        If ``dtype`` is not one of ``"int"``, ``"uint"``, ``"float"``,
        ``"string"``.
    """
    N = 10 ** 5
    data = {
        "int": pd.Int64Index(np.arange(N).repeat(5)),
        "uint": pd.UInt64Index(np.arange(N).repeat(5)),
        "float": pd.Float64Index(np.random.randn(N).repeat(5)),
        "string": tm.makeStringIndex(N).repeat(5),
    }
    self.idx = data[dtype]
    # cache is_unique
    self.idx.is_unique
@@ -69,15 +75,17 @@ def time_duplicated(self, keep, dtype):
69
75
70
76
class DuplicatedUniqueIndex :
71
77
72
# Single-axis parameter grid of index dtype labels. The labels are used
# verbatim as keys into the `data` dict built in `setup`, so they must not
# carry surrounding whitespace.
params = ["int", "uint", "float", "string"]
# Name of the single axis, matching the argument `setup` receives.
param_names = ["dtype"]
74
80
75
81
def setup(self, dtype):
    """Build the index selected by ``dtype`` and store it on ``self.idx``.

    All four candidate indexes are constructed (none of them repeated) and
    the one keyed by ``dtype`` is kept.

    Raises
    ------
    KeyError
        If ``dtype`` is not one of ``"int"``, ``"uint"``, ``"float"``,
        ``"string"``.
    """
    N = 10 ** 5
    data = {
        "int": pd.Int64Index(np.arange(N)),
        "uint": pd.UInt64Index(np.arange(N)),
        "float": pd.Float64Index(np.random.randn(N)),
        "string": tm.makeStringIndex(N),
    }
    self.idx = data[dtype]
    # cache is_unique
    self.idx.is_unique
@@ -87,67 +95,74 @@ def time_duplicated_unique(self, dtype):
87
95
88
96
89
97
class Hashing:
    """Benchmarks for ``hashing.hash_pandas_object`` on a frame and its columns.

    ``hashing`` is the module resolved at import time by the loop at the top
    of this file.
    """

    def setup_cache(self):
        """Build and return the mixed-dtype frame the ``time_*`` methods hash.

        The frame has ``10 ** 5`` rows with string, float, integer,
        datetime and timedelta columns, plus a categorical copy of the
        string column. Presumably the benchmark runner passes the returned
        frame to each ``time_*`` method as ``df`` -- confirm against the
        runner's documentation.
        """
        N = 10 ** 5

        df = pd.DataFrame(
            {
                "strings": pd.Series(
                    tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=N))
                ),
                "floats": np.random.randn(N),
                "ints": np.arange(N),
                "dates": pd.date_range("20110101", freq="s", periods=N),
                "timedeltas": pd.timedelta_range("1 day", freq="s", periods=N),
            }
        )
        # Derived after construction so it is categorical over the sampled strings.
        df["categories"] = df["strings"].astype("category")
        # Blank out rows 10..19 in every column so missing values are hashed too.
        df.iloc[10:20] = np.nan
        return df

    def time_frame(self, df):
        """Hash the whole frame."""
        hashing.hash_pandas_object(df)

    def time_series_int(self, df):
        """Hash the ``ints`` column."""
        hashing.hash_pandas_object(df["ints"])

    def time_series_string(self, df):
        """Hash the ``strings`` column."""
        hashing.hash_pandas_object(df["strings"])

    def time_series_float(self, df):
        """Hash the ``floats`` column."""
        hashing.hash_pandas_object(df["floats"])

    def time_series_categorical(self, df):
        """Hash the ``categories`` column."""
        hashing.hash_pandas_object(df["categories"])

    def time_series_timedeltas(self, df):
        """Hash the ``timedeltas`` column."""
        hashing.hash_pandas_object(df["timedeltas"])

    def time_series_dates(self, df):
        """Hash the ``dates`` column."""
        hashing.hash_pandas_object(df["dates"])
125
136
126
137
127
138
class Quantile:
    """Benchmark ``Series.quantile`` across quantiles, interpolations and dtypes."""

    # Parameter grid; one axis per entry in `param_names`. The dtype labels
    # are the keys of the `data` dict built in `setup`.
    params = [
        [0, 0.5, 1],
        ["linear", "nearest", "lower", "higher", "midpoint"],
        ["float", "int", "uint"],
    ]
    param_names = ["quantile", "interpolation", "dtype"]

    def setup(self, quantile, interpolation, dtype):
        """Store a Series of the requested ``dtype`` on ``self.idx``.

        The selected array of ``10 ** 5`` values is repeated five times
        before being wrapped in a Series. ``quantile`` and ``interpolation``
        are not used here.

        Raises
        ------
        KeyError
            If ``dtype`` is not one of ``"int"``, ``"uint"``, ``"float"``.
        """
        N = 10 ** 5
        data = {
            "int": np.arange(N),
            "uint": np.arange(N).astype(np.uint64),
            "float": np.random.randn(N),
        }
        self.idx = pd.Series(data[dtype].repeat(5))

    def time_quantile(self, quantile, interpolation, dtype):
        """Compute one quantile of ``self.idx``; the result is discarded."""
        self.idx.quantile(quantile, interpolation=interpolation)
142
157
143
158
144
159
class SortIntegerArray:
    """Benchmark ``argsort`` on an ``Int64`` array that contains a missing value."""

    # Array lengths to benchmark; each value is passed to `setup` as `N`.
    params = [10 ** 3, 10 ** 5]

    def setup(self, N):
        """Store an ``Int64`` array of length ``N`` on ``self.array``.

        The values are ``0 .. N-1`` except position 40, which is set to NaN
        before conversion. ``N`` must therefore be greater than 40.

        Raises
        ------
        IndexError
            If ``N`` is 40 or smaller, because position 40 does not exist.
        """
        # Built as float first so one slot can be set to NaN before conversion.
        data = np.arange(N, dtype=float)
        data[40] = np.nan
        self.array = pd.array(data, dtype="Int64")

    def time_argsort(self, N):
        """Argsort ``self.array``; the result is discarded."""
        self.array.argsort()
0 commit comments