VV
Size: a a a
VV
PU
ГД
`which.max` вернёт только первую позицию максимума и не гарантирует стабильность результата.
`which(res %in% max(res))` вернёт коды всех цветов, которые встречаются чаще всего.
PU
`which(res %in% max(res))` вернёт коды всех цветов, которые встречаются чаще всего.
> matr <- x
> splitter = matr[,3]*100*100 + matr[,2]*100 + matr[,1]
> res = tabulate(splitter)
> which(splitter == which.max(res))
[1] 1 3 5
> f_base(x)
[,1] [,2] [,3]
[1,] 1 2 1
ГД
ГД
PU
> f_base(x)
[,1] [,2] [,3]
[1,] 1 2 1
> f_dt(x)
[,1] [,2] [,3]
[1,] 1 2 1
PU
ГД
# Reproducible sample data: N rows, three integer columns drawn from 0..31.
set.seed(123)
N <- 10000
x <- matrix(sample(0:31, size = N * 3, replace = TRUE), nrow = N, ncol = 3)
ГД
ГД
# Reproducible sample data: N rows, three integer columns drawn from 0..31.
set.seed(123)
N <- 10000
x <- matrix(sample(0:31, size = N * 3, replace = TRUE), nrow = N, ncol = 3)
# Second handle to the same matrix under the name `mx`.
mx <- x
library(data.table)
#' Most frequent row of a three-column matrix, computed with data.table.
#'
#' @param mx Matrix without column names (so the columns become V1, V2, V3
#'   after conversion to a data frame).
#' @return A one-row matrix with columns V1, V2, V3 holding the row that
#'   occurs most often.
f_dt <- function(mx) {
  dt <- as.data.frame(mx)
  setDT(dt)
  # One line per distinct (V1, V2, V3) combination with its frequency N.
  freq <- dt[, .N, by = .(V1, V2, V3)]
  # Highest frequency first; keep only the top combination.
  top <- freq[order(-N)][1]
  top[, cbind(V1, V2, V3)]
}
# Run the data.table implementation on the sample matrix `x`.
f_dt(x)
#' Most frequent row of a three-column matrix, using base R only.
#'
#' Rows are turned into space-separated text keys, the keys are counted with
#' `table()`, and the most frequent key is parsed back into numbers.
#'
#' @param mx Matrix whose first three columns form the row to count.
#' @return A 1 x 3 numeric (double) matrix holding the most frequent row.
f_base <- function(mx) {
  row_keys <- paste(mx[, 1], mx[, 2], mx[, 3])
  ranked <- sort(table(row_keys), decreasing = TRUE)
  top_key <- names(ranked[1])
  # Split the winning key back into its three components.
  parts <- strsplit(top_key, '\\s')[[1]]
  matrix(as.numeric(parts), nrow = 1, ncol = 3)
}
# Run the base-R implementation on the sample matrix `x`.
f_base(x)
#' Most frequent row of a three-column matrix (tabulate-based).
#'
#' Each row is packed into a single integer key, the keys are counted with
#' `tabulate()`, and the first row carrying the most frequent key is returned.
#'
#' @param mx Numeric matrix; the first three columns form the key. Values are
#'   expected to be non-negative integers strictly below `mult`, otherwise
#'   different rows can end up with the same key.
#' @param mult Base used to pack the three columns into one key.
#' @return A one-row matrix (same columns as `mx`) with the most frequent row;
#'   on ties the row with the smallest key wins. Zero-row input yields a
#'   zero-row matrix.
f_tabulate <- function(mx, mult = 100) {
  if (nrow(mx) == 0L) {
    return(mx[0L, , drop = FALSE])
  }
  # Shift keys by one: tabulate() silently ignores bins <= 0, so without the
  # shift an all-zero row (key 0) would never be counted.
  keys <- mx[, 3] * mult * mult + mx[, 2] * mult + mx[, 1] + 1
  counts <- tabulate(keys)
  # which.max() picks the first (smallest) key among equally frequent ones.
  top_key <- which.max(counts)
  # First row that carries the winning key.
  mx[match(top_key, keys), , drop = FALSE]
}
# Run the tabulate-based implementation on the sample matrix `x`.
f_tabulate(x)
# Time the three implementations against each other on the same input.
library(microbenchmark)
microbenchmark(
dt = f_dt(x),
base = f_base(x),
tabulate = f_tabulate(x)
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# dt 3.108099 3.325447 3.886822 3.967064 4.221989 9.307412 100
# base 57.220403 59.190113 60.389002 60.283311 61.184157 66.289113 100
# tabulate 1.217681 1.334715 2.162456 1.357393 1.439168 60.760551 100
АК
Rcpp:
// [[Rcpp::plugins(cpp11)]]
#include <Rcpp.h>
using namespace Rcpp;
// Order-sensitive hash functor for any container exposing size(), begin()
// and end() over integer-like elements.
template <typename T>
class hasher {
public:
  // Starts from the element count and folds every element into the running
  // value; the mixing step follows the boost::hash_combine pattern.
  std::size_t operator()(const T& vec) const {
    std::size_t acc = vec.size();
    for (auto it = vec.begin(); it != vec.end(); ++it) {
      acc ^= *it + 0x9e3779b9 + (acc << 6) + (acc >> 2);
    }
    return acc;
  }
};
// Hashes every row of `x` and counts how often each hash occurs.
//
// Returns a list with two entries:
//   "hash"   - the hash of each row (as a decimal string), in row order;
//   "counts" - number of rows per distinct hash string.
// Rows are identified by their hash only, so rows whose hashes coincide are
// counted together.
// [[Rcpp::export]]
List count_rows(IntegerMatrix x) {
  const size_t n = x.rows();
  hasher<IntegerMatrix::Row> row_hash;
  std::vector<std::string> keys;
  keys.reserve(n);
  std::unordered_map<std::string, int> freq;
  for (size_t r = 0; r < n; ++r) {
    IntegerMatrix::Row current = x.row(r);
    keys.push_back(std::to_string(row_hash(current)));
    ++freq[keys.back()];
  }
  return List::create(
    Named("hash") = keys,
    Named("counts") = freq
  );
}
/*** R
# Reproducible sample data: N rows, three integer columns drawn from 0..31.
set.seed(123)
N <- 10000
x <- matrix(sample(0:31, size = N * 3, replace = TRUE), nrow = N, ncol = 3)
res <- count_rows(x)
# Hashes whose frequency equals the maximum (several on ties).
h <- names(res$counts)[res$counts == max(res$counts)]
# Index of the first row carrying each of those hashes.
match(h, res$hash)
*/
АК
АК
В `res$counts` — частоты хэшей, а в `res$hash` — хэш для каждой строки.
АК
ГД
#' Most frequent row of a three-column matrix (tabulate-based, compact keys).
#'
#' Each row is packed into a single integer key, the keys are counted with
#' `tabulate()`, and the first row carrying the most frequent key is returned.
#'
#' @param mx Numeric matrix; the first three columns form the key. Values are
#'   expected to be non-negative integers strictly below `mult`, otherwise
#'   different rows can end up with the same key.
#' @param mult Base used to pack the three columns into one key.
#' @return A one-row matrix (same columns as `mx`) with the most frequent row;
#'   on ties the row with the smallest key wins. Zero-row input yields a
#'   zero-row matrix.
f_tabulate <- function(mx, mult = 32) {
  if (nrow(mx) == 0L) {
    return(mx[0L, , drop = FALSE])
  }
  # Shift keys by one: tabulate() silently ignores bins <= 0, so without the
  # shift an all-zero row (key 0) would never be counted.
  keys <- mx[, 3] * mult * mult + mx[, 2] * mult + mx[, 1] + 1
  counts <- tabulate(keys)
  # which.max() picks the first (smallest) key among equally frequent ones.
  top_key <- which.max(counts)
  # First row that carries the winning key.
  mx[match(top_key, keys), , drop = FALSE]
}
# Unit: microseconds
# expr min lq mean median uq max neval
# dt 3098.166 3185.073 3728.0666 3334.8835 4014.0755 12412.863 100
# base 57688.870 58858.048 60201.7815 59508.4385 60817.6605 70619.858 100
# tabulate 314.518 352.260 409.4361 359.3785 364.3445 2914.091 100
АК
#' Most frequent row of a three-column matrix (tabulate-based, compact keys).
#'
#' Each row is packed into a single integer key, the keys are counted with
#' `tabulate()`, and the first row carrying the most frequent key is returned.
#'
#' @param mx Numeric matrix; the first three columns form the key. Values are
#'   expected to be non-negative integers strictly below `mult`, otherwise
#'   different rows can end up with the same key.
#' @param mult Base used to pack the three columns into one key.
#' @return A one-row matrix (same columns as `mx`) with the most frequent row;
#'   on ties the row with the smallest key wins. Zero-row input yields a
#'   zero-row matrix.
f_tabulate <- function(mx, mult = 32) {
  if (nrow(mx) == 0L) {
    return(mx[0L, , drop = FALSE])
  }
  # Shift keys by one: tabulate() silently ignores bins <= 0, so without the
  # shift an all-zero row (key 0) would never be counted.
  keys <- mx[, 3] * mult * mult + mx[, 2] * mult + mx[, 1] + 1
  counts <- tabulate(keys)
  # which.max() picks the first (smallest) key among equally frequent ones.
  top_key <- which.max(counts)
  # First row that carries the winning key.
  mx[match(top_key, keys), , drop = FALSE]
}
# Unit: microseconds
# expr min lq mean median uq max neval
# dt 3098.166 3185.073 3728.0666 3334.8835 4014.0755 12412.863 100
# base 57688.870 58858.048 60201.7815 59508.4385 60817.6605 70619.858 100
# tabulate 314.518 352.260 409.4361 359.3785 364.3445 2914.091 100
> length(splitter)
[1] 10000
> length(tabulate(splitter))
[1] 32759
ГД
> length(splitter)
[1] 10000
> length(tabulate(splitter))
[1] 32759
ГД