MacRuby : utf8
# -*- coding: utf-8 -*-
#
# Micro-benchmark for String#chomp.
#
# Three subject strings are built up front, each terminated by three
# newlines: a long multibyte (UTF-8) string, a short multibyte string and
# a plain ASCII string. Every report then calls the non-destructive
# String#chomp 1000 times on its subject, so the subject itself is never
# modified between iterations.
require 'benchmark'

Benchmark.bm(20) do |bench|
  trailing_newlines = "\n\n\n"

  # [report label, subject string] pairs, listed in the order they are run.
  subjects = [
    ["long utf-8 string",  ("あいうえお" * 10000) << trailing_newlines],
    ["short utf-8 string", ("あいうえお" * 10) << trailing_newlines],
    ["ascii string",       ("abcdefgh" * 1000) << trailing_newlines]
  ]

  subjects.each do |label, subject|
    bench.report(label) do
      1000.times { subject.chomp }
    end
  end
end
= before
$ macruby bm_str_utf8.rb
                           user     system      total        real
long utf-8 string      1.980000   0.080000   2.060000 (  1.958647)
short utf-8 string     0.010000   0.000000   0.010000 (  0.004149)
ascii string           0.010000   0.000000   0.010000 (  0.007358)
= after
$ DYLD_LIBRARY_PATH=. ./macruby -I./lib ~/tmp/bm/bm_str_utf8.rb
                           user     system      total        real
long utf-8 string      0.530000   0.050000   0.580000 (  0.570899)
short utf-8 string     0.000000   0.000000   0.000000 (  0.003513)
ascii string           0.000000   0.000000   0.000000 (  0.008428)
diff --git a/string.c b/string.c
index 222e4a0..f39cf83 100644
--- a/string.c
+++ b/string.c
@@ -28,6 +28,7 @@
#include <unicode/unum.h>
#include <unicode/utrans.h>
#include <unicode/uchar.h>
+#include <unicode/utext.h>
#define SET_CLASS(dst, src) \
do{ \
@@ -546,6 +547,12 @@ str_get_uchar(rb_str_t *self, long pos)
return c;
}
+ UErrorCode status = U_ZERO_ERROR;
+ UText *ut = utext_openUTF8(NULL, self->bytes, self->length_in_bytes, &status);
+ UChar return_value = utext_char32At(ut, pos);
+ utext_close(ut);
+
+#if 0
__block UChar return_value = 0;
__block long i = 0;
str_each_uchar32(self, ^(UChar32 c, long start_index, long char_len, bool *stop) {
@@ -572,6 +579,7 @@ str_get_uchar(rb_str_t *self, long pos)
}
}
});
+#endif
return return_value;
}