АК
Size: a a a
АК
АК
АК
// [[Rcpp::plugins(cpp11)]]
#include <Rcpp.h>
using namespace Rcpp;
template <typename T>
class hasher {
public:
std::size_t operator()(const T& vec) const {
size_t seed = vec.size();
for(auto& i : vec) {
seed ^= i + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}
return seed;
}
};
// [[Rcpp::export]]
size_t count_rows(IntegerMatrix x) {
size_t nrows = x.rows();
hasher<IntegerMatrix::Row> hash_fn;
std::unordered_map<size_t,size_t> hash_rows;
std::unordered_map<size_t,size_t> hash_counts;
using pair_type = decltype(hash_counts)::value_type;
for (size_t i = 0; i < nrows; ++i) {
IntegerMatrix::Row ri = x.row(i);
size_t h = hash_fn(ri);
hash_counts[h]++;
hash_rows[h] = i;
}
auto comp = [](const pair_type& p1, const pair_type& p2) {
return p1.second < p2.second;
};
auto hidx = std::max_element(hash_counts.begin(), hash_counts.end(), comp);
size_t midx = hash_rows[hidx->first];
return midx;
}
/*** R
set.seed(123)
N = 10000
x <- matrix(nrow = N, ncol = 3, data = sample(0:31, N*3, replace = TRUE))
x[count_rows(x),]
*/
АК
f_tabulate = function(mx){
mult = 32
splitter = mx[,3]*mult*mult + mx[,2]*mult + mx[,1]
res = tabulate(splitter)
res = which(res == max(res))[1]
mx[splitter==res, ,drop = FALSE][1, ,drop = FALSE]
}
# Unit: microseconds
# expr min lq mean median uq max neval
# dt 3098.166 3185.073 3728.0666 3334.8835 4014.0755 12412.863 100
# base 57688.870 58858.048 60201.7815 59508.4385 60817.6605 70619.858 100
# tabulate 314.518 352.260 409.4361 359.3785 364.3445 2914.091 100
АК
which(res == max(res))[1]
which.max
. Он должен быстрее работать.Unit: microseconds
expr min lq mean median uq max neval cld
f_tabulate1(x) 291.056 311.396 497.9079 327.5375 516.369 4679.221 100 a
f_tabulate2(x) 228.249 241.458 364.9199 268.5990 376.807 6679.179 100 a
АК
f_tabulate = function(mx) {
mult = 32
splitter = mx[,3] * mult * mult + mx[,2] * mult + mx[,1]
res = tabulate(splitter)
idx = which.max(splitter == which.max(res))
mx[idx, ]
}
ГД
АК
ГД
set.seed(123)
N = 10000
x <- matrix(nrow = N, ncol = 3, data = sample(0:31, N*3, replace = TRUE))
mx <- x
library(data.table)
f_dt <- function(mx) {
mx <- as.data.frame(mx)
setDT(mx)
mx <- mx[, .N, by = list(V1, V2, V3)][order(-N)][1]
mx[, cbind(V1, V2, V3)]
}
f_dt(x)
f_dt2 <- function(mx) {
mx <- as.data.table(mx)
mx[, .N, by = list(V1, V2, V3)][which.max(N)[1]][, cbind(V1, V2, V3)]
}
f_dt2(x)
f_base <- function(mx) {
mx <- paste(mx[, 1], mx[, 2], mx[, 3])
mx <- sort(table(mx), decreasing = TRUE)
mx <- names(mx[1])
mx <- as.numeric(strsplit(mx, '\\s')[[1]])
matrix(nrow = 1, ncol = 3, data = mx)
}
f_base(x)
f_tabulate = function(mx){
mult = 32
splitter = mx[,3]*mult*mult + mx[,2]*mult + mx[,1]
res = tabulate(splitter)
res = which(res == max(res))[1]
mx[splitter==res, ,drop = FALSE][1, ,drop = FALSE]
}
f_tabulate(x)
f_tabulate2 = function(mx){
mult = 32
splitter = mx[,3]*mult*mult + mx[,2]*mult + mx[,1]
res = tabulate(splitter)
res = which.max(res)
mx[splitter==res, ,drop = FALSE][1, ,drop = FALSE]
}
f_tabulate2(x)
f_low_level = function(mx){
mult = 32
splitter = mx[,3]*mult*mult + mx[,2]*mult + mx[,1] + 1
res = rep(0, max(splitter))
for(i in seq_along(splitter)){
res[splitter[i]] = res[splitter[i]] + 1
}
res = which(res == max(res))[1]
mx[splitter==res, ,drop = FALSE][1, ,drop = FALSE]
}
f_low_level(x)
library(microbenchmark)
microbenchmark(
dt = f_dt(x),
dt2 = f_dt2(x),
base = f_base(x),
tabulate = f_tabulate(x),
tabulate2 = f_tabulate2(x),
loop = f_low_level(x)
)
# Unit: microseconds
# expr min lq mean median uq max neval
# dt 3128.625 3250.1285 3912.6456 3821.227 4258.572 7981.140 100
# dt2 2601.559 2705.8470 3335.4357 3053.803 3683.170 7413.352 100
# base 59269.405 59960.8480 61325.7161 60555.287 61727.445 73447.540 100
# tabulate 325.112 355.5705 366.4962 360.868 367.655 519.451 100
# tabulate2 254.925 287.8670 371.1212 294.157 299.454 7335.219 100
# loop 1472.606 1560.1740 1660.2241 1575.569 1597.088 4391.001 100
PU
set.seed(123)
N = 10000
x <- matrix(nrow = N, ncol = 3, data = sample(0:31, N*3, replace = TRUE))
mx <- x
library(data.table)
f_dt <- function(mx) {
mx <- as.data.frame(mx)
setDT(mx)
mx <- mx[, .N, by = list(V1, V2, V3)][order(-N)][1]
mx[, cbind(V1, V2, V3)]
}
f_dt(x)
f_dt2 <- function(mx) {
mx <- as.data.table(mx)
mx[, .N, by = list(V1, V2, V3)][which.max(N)[1]][, cbind(V1, V2, V3)]
}
f_dt2(x)
f_base <- function(mx) {
mx <- paste(mx[, 1], mx[, 2], mx[, 3])
mx <- sort(table(mx), decreasing = TRUE)
mx <- names(mx[1])
mx <- as.numeric(strsplit(mx, '\\s')[[1]])
matrix(nrow = 1, ncol = 3, data = mx)
}
f_base(x)
f_tabulate = function(mx){
mult = 32
splitter = mx[,3]*mult*mult + mx[,2]*mult + mx[,1]
res = tabulate(splitter)
res = which(res == max(res))[1]
mx[splitter==res, ,drop = FALSE][1, ,drop = FALSE]
}
f_tabulate(x)
f_tabulate2 = function(mx){
mult = 32
splitter = mx[,3]*mult*mult + mx[,2]*mult + mx[,1]
res = tabulate(splitter)
res = which.max(res)
mx[splitter==res, ,drop = FALSE][1, ,drop = FALSE]
}
f_tabulate2(x)
f_low_level = function(mx){
mult = 32
splitter = mx[,3]*mult*mult + mx[,2]*mult + mx[,1] + 1
res = rep(0, max(splitter))
for(i in seq_along(splitter)){
res[splitter[i]] = res[splitter[i]] + 1
}
res = which(res == max(res))[1]
mx[splitter==res, ,drop = FALSE][1, ,drop = FALSE]
}
f_low_level(x)
library(microbenchmark)
microbenchmark(
dt = f_dt(x),
dt2 = f_dt2(x),
base = f_base(x),
tabulate = f_tabulate(x),
tabulate2 = f_tabulate2(x),
loop = f_low_level(x)
)
# Unit: microseconds
# expr min lq mean median uq max neval
# dt 3128.625 3250.1285 3912.6456 3821.227 4258.572 7981.140 100
# dt2 2601.559 2705.8470 3335.4357 3053.803 3683.170 7413.352 100
# base 59269.405 59960.8480 61325.7161 60555.287 61727.445 73447.540 100
# tabulate 325.112 355.5705 366.4962 360.868 367.655 519.451 100
# tabulate2 254.925 287.8670 371.1212 294.157 299.454 7335.219 100
# loop 1472.606 1560.1740 1660.2241 1575.569 1597.088 4391.001 100
ГД
PU
ВК
Е
AS
Е
Е
AS
googleAuthR
это устаревший пакет, сейчас там через gargle
автроизация идёт, и он уже в другой директории и другие файлы кешируетЕ
AS