Watson1978
1/27/2012 - 2:19 AM

MacRuby : patch

MacRuby : patch

# -*- coding: utf-8 -*-
require 'benchmark'

# Times three ways of deriving a new String from a 60,000-character source:
# a dup/chomp/dup chain, a full-range slice, and a slice that drops the last
# character. Report labels are padded to a width of 10.
Benchmark.bm(10) do |bench|
  iterations = 5000
  source = "abcdef" * 10000
  # Declared outside the report blocks, so the copy produced by the final
  # iteration of the first report stays referenced while the others run.
  copy = source

  bench.report("dup") do
    iterations.times do
      copy = source.dup
      chomped = copy.chomp
      chomped.dup
    end
  end

  # Slice covering the whole string.
  bench.report("[0..-1]") { iterations.times { source[0..-1] } }

  # Slice covering everything except the last character.
  bench.report("[0..-2]") { iterations.times { source[0..-2] } }
end
diff --git a/encoding.h b/encoding.h
index 4c1d2665ec647d385b6327260ec599553ae7ab7f..0e2a3dad6a4c52adce3b605e92d8daad58148c68 100644
--- a/encoding.h
+++ b/encoding.h
@@ -59,6 +59,8 @@ typedef struct {
     long length_in_bytes;
     char *bytes;
     str_flag_t flags;
+    bool is_shared;
+    bool is_embed;
 } rb_str_t;
 
 #define RSTR(x) ((rb_str_t *)x)
diff --git a/include/ruby/intern.h b/include/ruby/intern.h
index 6be2c6602fea1ca7a8ce056776114aee926e8018..217f53d5381d295c2375f446f5881e74a1b84519 100644
--- a/include/ruby/intern.h
+++ b/include/ruby/intern.h
@@ -491,6 +491,7 @@ VALUE rb_str_format(int, const VALUE *, VALUE);
 VALUE rb_str_new(const char*, long);
 VALUE rb_str_new_cstr(const char*);
 VALUE rb_str_new2(const char*);
+VALUE rb_str_new_shared(VALUE);
 VALUE rb_str_new3(VALUE);
 VALUE rb_str_new_frozen(VALUE);
 VALUE rb_str_new4(VALUE);
diff --git a/string.c b/string.c
index bb14e07ee163eacb6f907fdd78f408a48a49d946..a723dd86f68f5689de9ee3e9116feb0a349ddb73 100644
--- a/string.c
+++ b/string.c
@@ -42,6 +42,11 @@ VALUE rb_fs;
 
 static SEL selMATCH;
 
+#define SET_SHARED(s, bool) { ((rb_str_t*)s)->is_shared = bool; }
+#define SET_EMBED(s, bool)  { ((rb_str_t*)s)->is_embed = bool; }
+#define STR_SHARED_P(s)     (((rb_str_t*)s)->is_shared == true)
+#define STR_EMBED_P(s)      (((rb_str_t*)s)->is_embed == true)
+
 // rb_str_t primitives.
 
 static void
@@ -228,6 +233,8 @@ str_alloc(VALUE klass)
     str->capacity_in_bytes = 0;
     str->length_in_bytes = 0;
     str->bytes = NULL;
+    str->is_shared = false;
+    str->is_embed = false;
     str_reset_flags(str);
 
     return str;
@@ -249,6 +256,44 @@ str_new_like(VALUE obj)
 
 static void str_resize_bytes(rb_str_t *self, long new_capacity);
 static void str_concat_bytes(rb_str_t *self, const char *bytes, long len);
+static void str_replace_with_cfstring(rb_str_t *self, CFStringRef source);
+
+static inline void
+str_shared(rb_str_t *self, rb_str_t *source)
+{
+    GC_WB(&self->bytes, source->bytes);
+
+    self->length_in_bytes = source->length_in_bytes;
+    self->capacity_in_bytes = source->capacity_in_bytes;
+    self->encoding = source->encoding;
+    if (!source->flags) {
+	str_update_flags(source);
+    }
+    self->flags = source->flags;
+
+    SET_SHARED(source, true);
+    SET_EMBED(self, false);
+}
+
+static inline void
+str_shared_to_embed(rb_str_t *self)
+{
+    if (!STR_SHARED_P(self) && STR_EMBED_P(self)) {
+	return;
+    }
+
+    const char *bytes = self->bytes;
+
+    SET_SHARED(self, false);
+    SET_EMBED(self, true);
+
+    if (self->capacity_in_bytes == self->length_in_bytes) {
+	self->capacity_in_bytes++;
+    }
+    GC_WB(&self->bytes, xmalloc(self->capacity_in_bytes));
+    memcpy(self->bytes, bytes, self->length_in_bytes);
+    self->bytes[self->length_in_bytes] = '\0';
+}
 
 static void
 str_replace_with_bytes(rb_str_t *self, const char *bytes, long len,
@@ -257,6 +302,8 @@ str_replace_with_bytes(rb_str_t *self, const char *bytes, long len,
     assert(len >= 0);
     assert(enc != NULL);
 
+    SET_SHARED(self, false)
+    SET_EMBED(self, true);
     str_reset_flags(self);
     self->encoding = enc;
     if (len > 0) {
@@ -280,8 +327,13 @@ str_replace_with_string(rb_str_t *self, rb_str_t *source)
     if (self == source) {
 	return;
     }
-    str_replace_with_bytes(self, source->bytes, source->length_in_bytes,
-	    source->encoding);
+    if (source->length_in_bytes > 0) {
+	str_shared(self, source);
+    }
+    else {
+	str_replace_with_bytes(self, source->bytes, source->length_in_bytes,
+			       source->encoding);
+    }
     if (!source->flags) {
 	str_update_flags(source);
     }
@@ -291,6 +343,7 @@ str_replace_with_string(rb_str_t *self, rb_str_t *source)
 static void
 str_append_uchar32(rb_str_t *self, UChar32 c)
 {
+    str_shared_to_embed(self);
     str_reset_flags(self);
     if ((c <= 127) && self->encoding->ascii_compatible) {
 	str_resize_bytes(self, self->length_in_bytes + 1);
@@ -327,6 +380,7 @@ str_replace_with_uchars(rb_str_t *self, const UChar *chars, long len)
 {
     assert(len >= 0);
 
+    str_shared_to_embed(self);
     str_reset_flags(self);
     self->length_in_bytes = 0;
     self->encoding = rb_encodings[ENCODING_UTF8];
@@ -606,6 +660,8 @@ str_new_copy_of_part(rb_str_t *self, long offset_in_bytes,
 	// then a part of that string is also ASCII only
 	str_set_ascii_only(str, true);
     }
+    SET_SHARED(str, false); // flag the new copy; the source's state is untouched
+    SET_EMBED(str, true);   // the copy is the one being returned below
     return str;
 }
 
@@ -783,7 +839,8 @@ str_resize_bytes(rb_str_t *self, long new_capacity)
 	rb_raise(rb_eArgError, "negative string size (or size too big)");
     }
     if (self->capacity_in_bytes < new_capacity) {
-	size_t capacity = new_capacity * 1.2;
+	str_shared_to_embed(self);
+	size_t capacity = new_capacity * 1.2 + 1;
 	if (capacity > 0){
 	    new_capacity = capacity;
 	}
@@ -804,8 +861,7 @@ static void
 str_ensure_null_terminator(rb_str_t *self)
 {
     if (self->length_in_bytes > 0
-	&& (self->capacity_in_bytes == self->length_in_bytes
-	    || self->bytes[self->length_in_bytes] != '\0')) {
+	&& self->bytes[self->length_in_bytes] != '\0') {
 	str_resize_bytes(self, self->length_in_bytes + 1);
 	self->bytes[self->length_in_bytes] = '\0';
     }
@@ -817,6 +873,7 @@ str_splice(rb_str_t *self, long pos, long len, rb_str_t *str)
     // self[pos..pos+len] = str
     assert(pos >= 0 && len >= 0);
 
+    str_shared_to_embed(self);
     if (str != NULL) {
 	str_must_have_compatible_encoding(self, str);
     }
@@ -911,6 +968,7 @@ str_concat_bytes(rb_str_t *self, const char *bytes, long len)
 
     const long new_length_in_bytes = self->length_in_bytes + len;
 
+    str_shared_to_embed(self);
     str_resize_bytes(self, new_length_in_bytes);
     memcpy(self->bytes + self->length_in_bytes, bytes, len);
     self->length_in_bytes = new_length_in_bytes;
@@ -922,6 +980,7 @@ str_concat_uchars(rb_str_t *self, const UChar *chars, long len)
     if (len == 0) {
 	return;
     }
+    str_shared_to_embed(self);
     str_reset_flags(self);
     if (IS_UTF8_ENC(self->encoding)) {
 	long new_length_in_bytes = self->length_in_bytes;
@@ -1499,6 +1558,10 @@ rstr_substr_with_cache(VALUE str, long beg, long len,
 	len = n - beg;
     }
 
+    if (beg == 0 && n == len) {
+	return rb_str_new_shared(str);
+    }
+
     rb_str_t *substr = str_get_characters(RSTR(str), beg, beg + len - 1, cache);
     OBJ_INFECT(substr, str);
     return substr == NULL ? Qnil : (VALUE)substr;
@@ -1971,6 +2034,7 @@ rstr_setbyte(VALUE self, SEL sel, VALUE idx, VALUE value)
     if (index < 0) {
 	index += RSTR(self)->length_in_bytes;
     }
+    str_shared_to_embed(RSTR(self));
     str_reset_flags(RSTR(self));
     RSTR(self)->bytes[index] = byte;
     return value;
@@ -3980,7 +4044,7 @@ rstr_sub_bang(VALUE str, SEL sel, int argc, VALUE *argv)
 	assert(count > 0);
 
 	if (block_given || !NIL_P(hash)) {
-            if (block_given) {
+	    if (block_given) {
 		rb_match_busy(match);
 		const unsigned long hash = rb_str_hash(str);
 		repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
@@ -4286,6 +4350,7 @@ rstr_change_case(VALUE str, change_case_callback_t callback)
 		char new_c = callback(c, i == 0);
 		if (new_c != c) {
 		    changed = true;
+		    str_shared_to_embed(RSTR(str));
 		    RSTR(str)->bytes[i] = new_c;
 		}
 	    }
@@ -4302,6 +4367,7 @@ rstr_change_case(VALUE str, change_case_callback_t callback)
 		char new_c = callback(c, start_index == 0);
 		if (new_c != c) {
 		    changed = true;
+		    str_shared_to_embed(RSTR(str));
 		    memset(&RSTR(str)->bytes[start_index], 0, char_len);
 		    if (RSTR(str)->encoding->little_endian) {
 			RSTR(str)->bytes[start_index] = new_c;
@@ -5259,6 +5325,8 @@ rstr_reverse_bang(VALUE str, SEL sel)
 
     RSTR(str)->capacity_in_bytes = RSTR(str)->length_in_bytes;
     GC_WB(&RSTR(str)->bytes, new_bytes);
+    SET_SHARED(str, false);
+    SET_EMBED(str, true);
 
     // we modify it directly so the information stored
     // in the facultative flags might be outdated
@@ -6371,10 +6439,17 @@ rb_str_new2(const char *cstr)
 VALUE
 rb_str_new3(VALUE source)
 {
-    rb_str_t *str = str_alloc(rb_obj_class(source));
-    str_replace(str, source);
-    OBJ_INFECT(str, source);
-    return (VALUE)str;
+    rb_str_t *str1 = str_alloc(rb_obj_class(source));
+    rb_str_t *str2 = str_need_string(source);
+    str_shared(str1, str2);
+    OBJ_INFECT(str1, str2);
+    return (VALUE)str1;
+}
+
+VALUE
+rb_str_new_shared(VALUE source)
+{
+    return rb_str_new3(source);
 }
 
 VALUE