00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014 #include "ruby/ruby.h"
00015 #include "ruby/re.h"
00016 #include "ruby/encoding.h"
00017 #include "internal.h"
00018 #include <assert.h>
00019
/*
 * Accessors for entry `no` of the beg[] / end[] arrays of a `regs` object.
 * A pointer named `regs` must be in scope wherever these are expanded.
 */
#define BEG(no) (regs->beg[(no)])
#define END(no) (regs->end[(no)])
00022
00023 #include <math.h>
00024 #include <ctype.h>
00025
00026 #ifdef HAVE_UNISTD_H
00027 #include <unistd.h>
00028 #endif
00029
/* Number of elements in a true array (not a pointer), converted to int. */
#define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
00031
00032 #undef rb_str_new_cstr
00033 #undef rb_tainted_str_new_cstr
00034 #undef rb_usascii_str_new_cstr
00035 #undef rb_external_str_new_cstr
00036 #undef rb_locale_str_new_cstr
00037 #undef rb_str_new2
00038 #undef rb_str_new3
00039 #undef rb_str_new4
00040 #undef rb_str_new5
00041 #undef rb_tainted_str_new2
00042 #undef rb_usascii_str_new2
00043 #undef rb_str_dup_frozen
00044 #undef rb_str_buf_new_cstr
00045 #undef rb_str_buf_new2
00046 #undef rb_str_buf_cat2
00047 #undef rb_str_cat2
00048
00049 static VALUE rb_str_clear(VALUE str);
00050
00051 VALUE rb_cString;
00052 VALUE rb_cSymbol;
00053
/* NOTE(review): presumably an upper bound on the byte length of a single
 * character; no use is visible in this part of the file -- confirm. */
#define RUBY_MAX_CHAR_LEN 16
/* Flag checked by str_modifiable(): a string carrying it cannot be modified. */
#define STR_TMPLOCK FL_USER7
/* Flag whose absence means the string uses the embedded layout (STR_EMBED_P). */
#define STR_NOEMBED FL_USER1
/* NOTE(review): STR_SHARED_P below tests ELTS_SHARED rather than this flag. */
#define STR_SHARED FL_USER2
/* Flag marking a non-embedded string whose as.heap.aux.shared slot holds an
 * associated object (see rb_str_new_frozen). */
#define STR_ASSOC FL_USER3
/* True when both STR_NOEMBED and ELTS_SHARED are set on s. */
#define STR_SHARED_P(s) FL_ALL((s), STR_NOEMBED|ELTS_SHARED)
/* True when both STR_NOEMBED and STR_ASSOC are set on s. */
#define STR_ASSOC_P(s) FL_ALL((s), STR_NOEMBED|STR_ASSOC)
/* Flag combination for strings that do not record a capacity in
 * as.heap.aux.capa; rb_str_capacity() reports as.heap.len for them. */
#define STR_NOCAPA (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
/* True for a non-embedded string that is shared or has an association. */
#define STR_NOCAPA_P(s) (FL_TEST((s),STR_NOEMBED) && FL_ANY((s),ELTS_SHARED|STR_ASSOC))
/* Clears ELTS_SHARED and STR_ASSOC, but only when s is non-embedded. */
#define STR_UNSET_NOCAPA(s) do {\
    if (FL_TEST((s),STR_NOEMBED)) FL_UNSET((s),(ELTS_SHARED|STR_ASSOC));\
} while (0)
00066
00067
/* Marks str as non-embedded and zeroes the embedded-length bits in its flags. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    STR_SET_EMBED_LEN((str), 0);\
} while (0)
/* Marks str as embedded by clearing STR_NOEMBED. */
#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED)
/* True when str uses the embedded layout (STR_NOEMBED not set). */
#define STR_EMBED_P(str) (!FL_TEST((str), STR_NOEMBED))
/* Stores n as the embedded length inside the flags word of str.
 * n is evaluated once (copied into tmp_n); str is evaluated twice. */
#define STR_SET_EMBED_LEN(str, n) do { \
    long tmp_n = (n);\
    RBASIC(str)->flags &= ~RSTRING_EMBED_LEN_MASK;\
    RBASIC(str)->flags |= (tmp_n) << RSTRING_EMBED_LEN_SHIFT;\
} while (0)
00079
/* Sets the byte length of str to n, writing either the embedded length bits
 * or as.heap.len depending on the current layout. */
#define STR_SET_LEN(str, n) do { \
    if (STR_EMBED_P(str)) {\
        STR_SET_EMBED_LEN((str), (n));\
    }\
    else {\
        RSTRING(str)->as.heap.len = (n);\
    }\
} while (0)

/* Decrements the byte length of str by one, for either layout. */
#define STR_DEC_LEN(str) do {\
    if (STR_EMBED_P(str)) {\
        long n = RSTRING_LEN(str);\
        n--;\
        STR_SET_EMBED_LEN((str), n);\
    }\
    else {\
        RSTRING(str)->as.heap.len--;\
    }\
} while (0)
00099
/*
 * Makes room for `capacity` bytes plus one extra byte in str.
 *  - Embedded string: nothing happens unless capacity exceeds
 *    RSTRING_EMBED_LEN_MAX; then the contents are copied into a freshly
 *    allocated heap buffer and the string is switched to the heap layout.
 *  - Heap string: the buffer is reallocated; the capacity is recorded only
 *    when the string is not STR_NOCAPA_P (i.e. the aux slot is not in use
 *    for a shared/associated object).
 * `capacity` and `str` are evaluated more than once.
 */
#define RESIZE_CAPA(str,capacity) do {\
    if (STR_EMBED_P(str)) {\
        if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
            char *tmp = ALLOC_N(char, (capacity)+1);\
            memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
        if (!STR_NOCAPA_P(str))\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
00117
/* True when the (possibly freshly scanned) coderange of str is 7bit. */
#define is_ascii_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
/* True when the (possibly freshly scanned) coderange of str is broken. */
#define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)

/* Encoding object for the encoding index stored on str. */
#define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
00122
00123 static inline int
00124 single_byte_optimizable(VALUE str)
00125 {
00126 rb_encoding *enc;
00127
00128
00129 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
00130 return 1;
00131
00132 enc = STR_ENC_GET(str);
00133 if (rb_enc_mbmaxlen(enc) == 1)
00134 return 1;
00135
00136
00137
00138 return 0;
00139 }
00140
00141 VALUE rb_fs;
00142
/*
 * Returns a pointer to the first byte in [p, e) for which ISASCII() is
 * false, or NULL when there is none.
 *
 * When NONASCII_MASK is available and the range is longer than two VALUE
 * words, the middle of the range is tested one aligned VALUE word at a
 * time; the word containing a non-ASCII byte (if any) and the unaligned
 * head and tail are examined byte by byte.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080ULL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    if ((int)sizeof(VALUE) * 2 < e - p) {
        const VALUE align_mask = sizeof(VALUE) - 1;
        /* first word boundary at or after p, last word boundary at or before e */
        const VALUE *word = (const VALUE*)(~align_mask & ((VALUE)p + align_mask));
        const VALUE *word_end = (const VALUE*)(~align_mask & (VALUE)e);

        for (; p < (const char *)word; p++) {
            if (!ISASCII(*p))
                return p;
        }
        for (; word < word_end; word++) {
            if (*word & NONASCII_MASK)
                break;
        }
        /* continue bytewise from the offending word (or from the tail) */
        p = (const char *)word;
    }
#endif
    for (; p < e; p++) {
        if (!ISASCII(*p))
            return p;
    }
    return NULL;
}
00179
00180 static int
00181 coderange_scan(const char *p, long len, rb_encoding *enc)
00182 {
00183 const char *e = p + len;
00184
00185 if (rb_enc_to_index(enc) == 0) {
00186
00187 p = search_nonascii(p, e);
00188 return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
00189 }
00190
00191 if (rb_enc_asciicompat(enc)) {
00192 p = search_nonascii(p, e);
00193 if (!p) {
00194 return ENC_CODERANGE_7BIT;
00195 }
00196 while (p < e) {
00197 int ret = rb_enc_precise_mbclen(p, e, enc);
00198 if (!MBCLEN_CHARFOUND_P(ret)) {
00199 return ENC_CODERANGE_BROKEN;
00200 }
00201 p += MBCLEN_CHARFOUND_LEN(ret);
00202 if (p < e) {
00203 p = search_nonascii(p, e);
00204 if (!p) {
00205 return ENC_CODERANGE_VALID;
00206 }
00207 }
00208 }
00209 if (e < p) {
00210 return ENC_CODERANGE_BROKEN;
00211 }
00212 return ENC_CODERANGE_VALID;
00213 }
00214
00215 while (p < e) {
00216 int ret = rb_enc_precise_mbclen(p, e, enc);
00217
00218 if (!MBCLEN_CHARFOUND_P(ret)) {
00219 return ENC_CODERANGE_BROKEN;
00220 }
00221 p += MBCLEN_CHARFOUND_LEN(ret);
00222 }
00223 if (e < p) {
00224 return ENC_CODERANGE_BROKEN;
00225 }
00226 return ENC_CODERANGE_VALID;
00227 }
00228
00229 long
00230 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
00231 {
00232 const char *p = s;
00233
00234 if (*cr == ENC_CODERANGE_BROKEN)
00235 return e - s;
00236
00237 if (rb_enc_to_index(enc) == 0) {
00238
00239 p = search_nonascii(p, e);
00240 *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
00241 return e - s;
00242 }
00243 else if (rb_enc_asciicompat(enc)) {
00244 p = search_nonascii(p, e);
00245 if (!p) {
00246 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
00247 return e - s;
00248 }
00249 while (p < e) {
00250 int ret = rb_enc_precise_mbclen(p, e, enc);
00251 if (!MBCLEN_CHARFOUND_P(ret)) {
00252 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00253 return p - s;
00254 }
00255 p += MBCLEN_CHARFOUND_LEN(ret);
00256 if (p < e) {
00257 p = search_nonascii(p, e);
00258 if (!p) {
00259 *cr = ENC_CODERANGE_VALID;
00260 return e - s;
00261 }
00262 }
00263 }
00264 *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00265 return p - s;
00266 }
00267 else {
00268 while (p < e) {
00269 int ret = rb_enc_precise_mbclen(p, e, enc);
00270 if (!MBCLEN_CHARFOUND_P(ret)) {
00271 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00272 return p - s;
00273 }
00274 p += MBCLEN_CHARFOUND_LEN(ret);
00275 }
00276 *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00277 return p - s;
00278 }
00279 }
00280
00281 static inline void
00282 str_enc_copy(VALUE str1, VALUE str2)
00283 {
00284 rb_enc_set_index(str1, ENCODING_GET(str2));
00285 }
00286
00287 static void
00288 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
00289 {
00290
00291
00292
00293 str_enc_copy(dest, src);
00294 switch (ENC_CODERANGE(src)) {
00295 case ENC_CODERANGE_7BIT:
00296 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00297 break;
00298 case ENC_CODERANGE_VALID:
00299 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
00300 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
00301 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00302 else
00303 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00304 break;
00305 default:
00306 if (RSTRING_LEN(dest) == 0) {
00307 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
00308 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00309 else
00310 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00311 }
00312 break;
00313 }
00314 }
00315
00316 static void
00317 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
00318 {
00319 str_enc_copy(dest, src);
00320 ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
00321 }
00322
00323 int
00324 rb_enc_str_coderange(VALUE str)
00325 {
00326 int cr = ENC_CODERANGE(str);
00327
00328 if (cr == ENC_CODERANGE_UNKNOWN) {
00329 rb_encoding *enc = STR_ENC_GET(str);
00330 cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
00331 ENC_CODERANGE_SET(str, cr);
00332 }
00333 return cr;
00334 }
00335
00336 int
00337 rb_enc_str_asciionly_p(VALUE str)
00338 {
00339 rb_encoding *enc = STR_ENC_GET(str);
00340
00341 if (!rb_enc_asciicompat(enc))
00342 return FALSE;
00343 else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
00344 return TRUE;
00345 return FALSE;
00346 }
00347
00348 static inline void
00349 str_mod_check(VALUE s, const char *p, long len)
00350 {
00351 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
00352 rb_raise(rb_eRuntimeError, "string modified");
00353 }
00354 }
00355
00356 size_t
00357 rb_str_capacity(VALUE str)
00358 {
00359 if (STR_EMBED_P(str)) {
00360 return RSTRING_EMBED_LEN_MAX;
00361 }
00362 else if (STR_NOCAPA_P(str)) {
00363 return RSTRING(str)->as.heap.len;
00364 }
00365 else {
00366 return RSTRING(str)->as.heap.aux.capa;
00367 }
00368 }
00369
00370 static inline VALUE
00371 str_alloc(VALUE klass)
00372 {
00373 NEWOBJ(str, struct RString);
00374 OBJSETUP(str, klass, T_STRING);
00375
00376 str->as.heap.ptr = 0;
00377 str->as.heap.len = 0;
00378 str->as.heap.aux.capa = 0;
00379
00380 return (VALUE)str;
00381 }
00382
00383 static VALUE
00384 str_new(VALUE klass, const char *ptr, long len)
00385 {
00386 VALUE str;
00387
00388 if (len < 0) {
00389 rb_raise(rb_eArgError, "negative string size (or size too big)");
00390 }
00391
00392 str = str_alloc(klass);
00393 if (len > RSTRING_EMBED_LEN_MAX) {
00394 RSTRING(str)->as.heap.aux.capa = len;
00395 RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
00396 STR_SET_NOEMBED(str);
00397 }
00398 else if (len == 0) {
00399 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
00400 }
00401 if (ptr) {
00402 memcpy(RSTRING_PTR(str), ptr, len);
00403 }
00404 STR_SET_LEN(str, len);
00405 RSTRING_PTR(str)[len] = '\0';
00406 return str;
00407 }
00408
00409 VALUE
00410 rb_str_new(const char *ptr, long len)
00411 {
00412 return str_new(rb_cString, ptr, len);
00413 }
00414
00415 VALUE
00416 rb_usascii_str_new(const char *ptr, long len)
00417 {
00418 VALUE str = rb_str_new(ptr, len);
00419 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00420 return str;
00421 }
00422
00423 VALUE
00424 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
00425 {
00426 VALUE str = rb_str_new(ptr, len);
00427 rb_enc_associate(str, enc);
00428 return str;
00429 }
00430
00431 VALUE
00432 rb_str_new_cstr(const char *ptr)
00433 {
00434 if (!ptr) {
00435 rb_raise(rb_eArgError, "NULL pointer given");
00436 }
00437 return rb_str_new(ptr, strlen(ptr));
00438 }
00439
00440 RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr))
00441 #define rb_str_new2 rb_str_new_cstr
00442
00443 VALUE
00444 rb_usascii_str_new_cstr(const char *ptr)
00445 {
00446 VALUE str = rb_str_new2(ptr);
00447 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00448 return str;
00449 }
00450
00451 RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr))
00452 #define rb_usascii_str_new2 rb_usascii_str_new_cstr
00453
00454 VALUE
00455 rb_tainted_str_new(const char *ptr, long len)
00456 {
00457 VALUE str = rb_str_new(ptr, len);
00458
00459 OBJ_TAINT(str);
00460 return str;
00461 }
00462
00463 VALUE
00464 rb_tainted_str_new_cstr(const char *ptr)
00465 {
00466 VALUE str = rb_str_new2(ptr);
00467
00468 OBJ_TAINT(str);
00469 return str;
00470 }
00471
00472 RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr))
00473 #define rb_tainted_str_new2 rb_tainted_str_new_cstr
00474
00475 VALUE
00476 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
00477 {
00478 rb_econv_t *ec;
00479 rb_econv_result_t ret;
00480 long len;
00481 VALUE newstr;
00482 const unsigned char *sp;
00483 unsigned char *dp;
00484
00485 if (!to) return str;
00486 if (from == to) return str;
00487 if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) ||
00488 to == rb_ascii8bit_encoding()) {
00489 if (STR_ENC_GET(str) != to) {
00490 str = rb_str_dup(str);
00491 rb_enc_associate(str, to);
00492 }
00493 return str;
00494 }
00495
00496 len = RSTRING_LEN(str);
00497 newstr = rb_str_new(0, len);
00498
00499 retry:
00500 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
00501 if (!ec) return str;
00502
00503 sp = (unsigned char*)RSTRING_PTR(str);
00504 dp = (unsigned char*)RSTRING_PTR(newstr);
00505 ret = rb_econv_convert(ec, &sp, (unsigned char*)RSTRING_END(str),
00506 &dp, (unsigned char*)RSTRING_END(newstr), 0);
00507 rb_econv_close(ec);
00508 switch (ret) {
00509 case econv_destination_buffer_full:
00510
00511 len = len < 2 ? 2 : len * 2;
00512 rb_str_resize(newstr, len);
00513 goto retry;
00514
00515 case econv_finished:
00516 len = dp - (unsigned char*)RSTRING_PTR(newstr);
00517 rb_str_set_len(newstr, len);
00518 rb_enc_associate(newstr, to);
00519 return newstr;
00520
00521 default:
00522
00523 return str;
00524 }
00525 }
00526
00527 VALUE
00528 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
00529 {
00530 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
00531 }
00532
00533 VALUE
00534 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
00535 {
00536 VALUE str;
00537
00538 str = rb_tainted_str_new(ptr, len);
00539 if (eenc == rb_usascii_encoding() &&
00540 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
00541 rb_enc_associate(str, rb_ascii8bit_encoding());
00542 return str;
00543 }
00544 rb_enc_associate(str, eenc);
00545 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
00546 }
00547
00548 VALUE
00549 rb_external_str_new(const char *ptr, long len)
00550 {
00551 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
00552 }
00553
00554 VALUE
00555 rb_external_str_new_cstr(const char *ptr)
00556 {
00557 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
00558 }
00559
00560 VALUE
00561 rb_locale_str_new(const char *ptr, long len)
00562 {
00563 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
00564 }
00565
00566 VALUE
00567 rb_locale_str_new_cstr(const char *ptr)
00568 {
00569 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
00570 }
00571
00572 VALUE
00573 rb_filesystem_str_new(const char *ptr, long len)
00574 {
00575 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
00576 }
00577
00578 VALUE
00579 rb_filesystem_str_new_cstr(const char *ptr)
00580 {
00581 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
00582 }
00583
00584 VALUE
00585 rb_str_export(VALUE str)
00586 {
00587 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding());
00588 }
00589
00590 VALUE
00591 rb_str_export_locale(VALUE str)
00592 {
00593 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
00594 }
00595
00596 VALUE
00597 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
00598 {
00599 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
00600 }
00601
00602 static VALUE
00603 str_replace_shared(VALUE str2, VALUE str)
00604 {
00605 if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
00606 STR_SET_EMBED(str2);
00607 memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
00608 STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
00609 }
00610 else {
00611 str = rb_str_new_frozen(str);
00612 FL_SET(str2, STR_NOEMBED);
00613 RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00614 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00615 RSTRING(str2)->as.heap.aux.shared = str;
00616 FL_SET(str2, ELTS_SHARED);
00617 }
00618 rb_enc_cr_str_exact_copy(str2, str);
00619
00620 return str2;
00621 }
00622
00623 static VALUE
00624 str_new_shared(VALUE klass, VALUE str)
00625 {
00626 return str_replace_shared(str_alloc(klass), str);
00627 }
00628
00629 static VALUE
00630 str_new3(VALUE klass, VALUE str)
00631 {
00632 return str_new_shared(klass, str);
00633 }
00634
00635 VALUE
00636 rb_str_new_shared(VALUE str)
00637 {
00638 VALUE str2 = str_new3(rb_obj_class(str), str);
00639
00640 OBJ_INFECT(str2, str);
00641 return str2;
00642 }
00643
00644 RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str))
00645 #define rb_str_new3 rb_str_new_shared
00646
00647 static VALUE
00648 str_new4(VALUE klass, VALUE str)
00649 {
00650 VALUE str2;
00651
00652 str2 = str_alloc(klass);
00653 STR_SET_NOEMBED(str2);
00654 RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00655 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00656 if (STR_SHARED_P(str)) {
00657 VALUE shared = RSTRING(str)->as.heap.aux.shared;
00658 assert(OBJ_FROZEN(shared));
00659 FL_SET(str2, ELTS_SHARED);
00660 RSTRING(str2)->as.heap.aux.shared = shared;
00661 }
00662 else {
00663 FL_SET(str, ELTS_SHARED);
00664 RSTRING(str)->as.heap.aux.shared = str2;
00665 }
00666 rb_enc_cr_str_exact_copy(str2, str);
00667 OBJ_INFECT(str2, str);
00668 return str2;
00669 }
00670
00671 VALUE
00672 rb_str_new_frozen(VALUE orig)
00673 {
00674 VALUE klass, str;
00675
00676 if (OBJ_FROZEN(orig)) return orig;
00677 klass = rb_obj_class(orig);
00678 if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
00679 long ofs;
00680 assert(OBJ_FROZEN(str));
00681 ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
00682 if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
00683 (!OBJ_TAINTED(str) && OBJ_TAINTED(orig)) ||
00684 ENCODING_GET(str) != ENCODING_GET(orig)) {
00685 str = str_new3(klass, str);
00686 RSTRING(str)->as.heap.ptr += ofs;
00687 RSTRING(str)->as.heap.len -= ofs;
00688 rb_enc_cr_str_exact_copy(str, orig);
00689 OBJ_INFECT(str, orig);
00690 }
00691 }
00692 else if (STR_EMBED_P(orig)) {
00693 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
00694 rb_enc_cr_str_exact_copy(str, orig);
00695 OBJ_INFECT(str, orig);
00696 }
00697 else if (STR_ASSOC_P(orig)) {
00698 VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
00699 FL_UNSET(orig, STR_ASSOC);
00700 str = str_new4(klass, orig);
00701 FL_SET(str, STR_ASSOC);
00702 RSTRING(str)->as.heap.aux.shared = assoc;
00703 }
00704 else {
00705 str = str_new4(klass, orig);
00706 }
00707 OBJ_FREEZE(str);
00708 return str;
00709 }
00710
00711 RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig))
00712 #define rb_str_new4 rb_str_new_frozen
00713
00714 VALUE
00715 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
00716 {
00717 return str_new(rb_obj_class(obj), ptr, len);
00718 }
00719
00720 RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len),
00721 rb_str_new_with_class, (obj, ptr, len))
00722 #define rb_str_new5 rb_str_new_with_class
00723
00724 static VALUE
00725 str_new_empty(VALUE str)
00726 {
00727 VALUE v = rb_str_new5(str, 0, 0);
00728 rb_enc_copy(v, str);
00729 OBJ_INFECT(v, str);
00730 return v;
00731 }
00732
00733 #define STR_BUF_MIN_SIZE 128
00734
00735 VALUE
00736 rb_str_buf_new(long capa)
00737 {
00738 VALUE str = str_alloc(rb_cString);
00739
00740 if (capa < STR_BUF_MIN_SIZE) {
00741 capa = STR_BUF_MIN_SIZE;
00742 }
00743 FL_SET(str, STR_NOEMBED);
00744 RSTRING(str)->as.heap.aux.capa = capa;
00745 RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
00746 RSTRING(str)->as.heap.ptr[0] = '\0';
00747
00748 return str;
00749 }
00750
00751 VALUE
00752 rb_str_buf_new_cstr(const char *ptr)
00753 {
00754 VALUE str;
00755 long len = strlen(ptr);
00756
00757 str = rb_str_buf_new(len);
00758 rb_str_buf_cat(str, ptr, len);
00759
00760 return str;
00761 }
00762
00763 RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr))
00764 #define rb_str_buf_new2 rb_str_buf_new_cstr
00765
00766 VALUE
00767 rb_str_tmp_new(long len)
00768 {
00769 return str_new(0, 0, len);
00770 }
00771
00772 void *
00773 rb_alloc_tmp_buffer(volatile VALUE *store, long len)
00774 {
00775 VALUE s = rb_str_tmp_new(len);
00776 *store = s;
00777 return RSTRING_PTR(s);
00778 }
00779
00780 void
00781 rb_free_tmp_buffer(volatile VALUE *store)
00782 {
00783 VALUE s = *store;
00784 *store = 0;
00785 if (s) rb_str_clear(s);
00786 }
00787
00788 void
00789 rb_str_free(VALUE str)
00790 {
00791 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00792 xfree(RSTRING(str)->as.heap.ptr);
00793 }
00794 }
00795
00796 RUBY_FUNC_EXPORTED size_t
00797 rb_str_memsize(VALUE str)
00798 {
00799 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00800 return RSTRING(str)->as.heap.aux.capa;
00801 }
00802 else {
00803 return 0;
00804 }
00805 }
00806
00807 VALUE
00808 rb_str_to_str(VALUE str)
00809 {
00810 return rb_convert_type(str, T_STRING, "String", "to_str");
00811 }
00812
00813 static inline void str_discard(VALUE str);
00814
00815 void
00816 rb_str_shared_replace(VALUE str, VALUE str2)
00817 {
00818 rb_encoding *enc;
00819 int cr;
00820 if (str == str2) return;
00821 enc = STR_ENC_GET(str2);
00822 cr = ENC_CODERANGE(str2);
00823 str_discard(str);
00824 OBJ_INFECT(str, str2);
00825 if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
00826 STR_SET_EMBED(str);
00827 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
00828 STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
00829 rb_enc_associate(str, enc);
00830 ENC_CODERANGE_SET(str, cr);
00831 return;
00832 }
00833 STR_SET_NOEMBED(str);
00834 STR_UNSET_NOCAPA(str);
00835 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00836 RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
00837 if (STR_NOCAPA_P(str2)) {
00838 FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
00839 RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
00840 }
00841 else {
00842 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
00843 }
00844 STR_SET_EMBED(str2);
00845 RSTRING_PTR(str2)[0] = 0;
00846 STR_SET_EMBED_LEN(str2, 0);
00847 rb_enc_associate(str, enc);
00848 ENC_CODERANGE_SET(str, cr);
00849 }
00850
00851 static ID id_to_s;
00852
00853 VALUE
00854 rb_obj_as_string(VALUE obj)
00855 {
00856 VALUE str;
00857
00858 if (TYPE(obj) == T_STRING) {
00859 return obj;
00860 }
00861 str = rb_funcall(obj, id_to_s, 0);
00862 if (TYPE(str) != T_STRING)
00863 return rb_any_to_s(obj);
00864 if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
00865 return str;
00866 }
00867
00868 static VALUE
00869 str_replace(VALUE str, VALUE str2)
00870 {
00871 long len;
00872
00873 len = RSTRING_LEN(str2);
00874 if (STR_ASSOC_P(str2)) {
00875 str2 = rb_str_new4(str2);
00876 }
00877 if (STR_SHARED_P(str2)) {
00878 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
00879 assert(OBJ_FROZEN(shared));
00880 STR_SET_NOEMBED(str);
00881 RSTRING(str)->as.heap.len = len;
00882 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00883 FL_SET(str, ELTS_SHARED);
00884 FL_UNSET(str, STR_ASSOC);
00885 RSTRING(str)->as.heap.aux.shared = shared;
00886 }
00887 else {
00888 str_replace_shared(str, str2);
00889 }
00890
00891 OBJ_INFECT(str, str2);
00892 rb_enc_cr_str_exact_copy(str, str2);
00893 return str;
00894 }
00895
00896 static VALUE
00897 str_duplicate(VALUE klass, VALUE str)
00898 {
00899 VALUE dup = str_alloc(klass);
00900 str_replace(dup, str);
00901 return dup;
00902 }
00903
00904 VALUE
00905 rb_str_dup(VALUE str)
00906 {
00907 return str_duplicate(rb_obj_class(str), str);
00908 }
00909
00910 VALUE
00911 rb_str_resurrect(VALUE str)
00912 {
00913 return str_replace(str_alloc(rb_cString), str);
00914 }
00915
00916
00917
00918
00919
00920
00921
00922
00923 static VALUE
00924 rb_str_init(int argc, VALUE *argv, VALUE str)
00925 {
00926 VALUE orig;
00927
00928 if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
00929 rb_str_replace(str, orig);
00930 return str;
00931 }
00932
00933 static inline long
00934 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
00935 {
00936 long c;
00937 const char *q;
00938
00939 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
00940 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00941 }
00942 else if (rb_enc_asciicompat(enc)) {
00943 c = 0;
00944 if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
00945 while (p < e) {
00946 if (ISASCII(*p)) {
00947 q = search_nonascii(p, e);
00948 if (!q)
00949 return c + (e - p);
00950 c += q - p;
00951 p = q;
00952 }
00953 p += rb_enc_fast_mbclen(p, e, enc);
00954 c++;
00955 }
00956 }
00957 else {
00958 while (p < e) {
00959 if (ISASCII(*p)) {
00960 q = search_nonascii(p, e);
00961 if (!q)
00962 return c + (e - p);
00963 c += q - p;
00964 p = q;
00965 }
00966 p += rb_enc_mbclen(p, e, enc);
00967 c++;
00968 }
00969 }
00970 return c;
00971 }
00972
00973 for (c=0; p<e; c++) {
00974 p += rb_enc_mbclen(p, e, enc);
00975 }
00976 return c;
00977 }
00978
00979 long
00980 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
00981 {
00982 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
00983 }
00984
00985 long
00986 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
00987 {
00988 long c;
00989 const char *q;
00990 int ret;
00991
00992 *cr = 0;
00993 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
00994 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00995 }
00996 else if (rb_enc_asciicompat(enc)) {
00997 c = 0;
00998 while (p < e) {
00999 if (ISASCII(*p)) {
01000 q = search_nonascii(p, e);
01001 if (!q) {
01002 if (!*cr) *cr = ENC_CODERANGE_7BIT;
01003 return c + (e - p);
01004 }
01005 c += q - p;
01006 p = q;
01007 }
01008 ret = rb_enc_precise_mbclen(p, e, enc);
01009 if (MBCLEN_CHARFOUND_P(ret)) {
01010 *cr |= ENC_CODERANGE_VALID;
01011 p += MBCLEN_CHARFOUND_LEN(ret);
01012 }
01013 else {
01014 *cr = ENC_CODERANGE_BROKEN;
01015 p++;
01016 }
01017 c++;
01018 }
01019 if (!*cr) *cr = ENC_CODERANGE_7BIT;
01020 return c;
01021 }
01022
01023 for (c=0; p<e; c++) {
01024 ret = rb_enc_precise_mbclen(p, e, enc);
01025 if (MBCLEN_CHARFOUND_P(ret)) {
01026 *cr |= ENC_CODERANGE_VALID;
01027 p += MBCLEN_CHARFOUND_LEN(ret);
01028 }
01029 else {
01030 *cr = ENC_CODERANGE_BROKEN;
01031 if (p + rb_enc_mbminlen(enc) <= e)
01032 p += rb_enc_mbminlen(enc);
01033 else
01034 p = e;
01035 }
01036 }
01037 if (!*cr) *cr = ENC_CODERANGE_7BIT;
01038 return c;
01039 }
01040
#ifdef NONASCII_MASK
/* True for any byte that is not of the form 10xxxxxx. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * Returns how many of the sizeof(VALUE) bytes in the word *s satisfy
 * is_utf8_lead_byte().
 *
 * For each byte, bit 6 of (d | ~(d >> 1)) is "bit6 OR NOT bit7", i.e. it is
 * set exactly when the byte is not 10xxxxxx.  Shifting that bit down to bit
 * 0 and masking leaves a 0/1 flag per byte; the flags are then summed into
 * the low byte.
 */
static inline VALUE
count_utf8_lead_bytes_with_word(const VALUE *s)
{
    VALUE flags = *s;

    /* one 0/1 flag per byte, in each byte's lowest bit */
    flags = (flags | ~(flags >> 1)) >> 6;
    flags &= NONASCII_MASK >> 7;

    /* horizontal sum of the per-byte flags */
    flags += (flags >> 8);
    flags += (flags >> 16);
#if SIZEOF_VALUE == 8
    flags += (flags >> 32);
#endif
    return (flags & 0xF);
}
#endif
01075
01076 static long
01077 str_strlen(VALUE str, rb_encoding *enc)
01078 {
01079 const char *p, *e;
01080 long n;
01081 int cr;
01082
01083 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
01084 if (!enc) enc = STR_ENC_GET(str);
01085 p = RSTRING_PTR(str);
01086 e = RSTRING_END(str);
01087 cr = ENC_CODERANGE(str);
01088 #ifdef NONASCII_MASK
01089 if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01090 enc == rb_utf8_encoding()) {
01091
01092 VALUE len = 0;
01093 if ((int)sizeof(VALUE) * 2 < e - p) {
01094 const VALUE *s, *t;
01095 const VALUE lowbits = sizeof(VALUE) - 1;
01096 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01097 t = (const VALUE*)(~lowbits & (VALUE)e);
01098 while (p < (const char *)s) {
01099 if (is_utf8_lead_byte(*p)) len++;
01100 p++;
01101 }
01102 while (s < t) {
01103 len += count_utf8_lead_bytes_with_word(s);
01104 s++;
01105 }
01106 p = (const char *)s;
01107 }
01108 while (p < e) {
01109 if (is_utf8_lead_byte(*p)) len++;
01110 p++;
01111 }
01112 return (long)len;
01113 }
01114 #endif
01115 n = rb_enc_strlen_cr(p, e, enc, &cr);
01116 if (cr) {
01117 ENC_CODERANGE_SET(str, cr);
01118 }
01119 return n;
01120 }
01121
01122 long
01123 rb_str_strlen(VALUE str)
01124 {
01125 return str_strlen(str, STR_ENC_GET(str));
01126 }
01127
01128
01129
01130
01131
01132
01133
01134
01135
01136 VALUE
01137 rb_str_length(VALUE str)
01138 {
01139 long len;
01140
01141 len = str_strlen(str, STR_ENC_GET(str));
01142 return LONG2NUM(len);
01143 }
01144
01145
01146
01147
01148
01149
01150
01151
01152 static VALUE
01153 rb_str_bytesize(VALUE str)
01154 {
01155 return LONG2NUM(RSTRING_LEN(str));
01156 }
01157
01158
01159
01160
01161
01162
01163
01164
01165
01166
01167
01168 static VALUE
01169 rb_str_empty(VALUE str)
01170 {
01171 if (RSTRING_LEN(str) == 0)
01172 return Qtrue;
01173 return Qfalse;
01174 }
01175
01176
01177
01178
01179
01180
01181
01182
01183
01184
01185
01186 VALUE
01187 rb_str_plus(VALUE str1, VALUE str2)
01188 {
01189 VALUE str3;
01190 rb_encoding *enc;
01191
01192 StringValue(str2);
01193 enc = rb_enc_check(str1, str2);
01194 str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
01195 memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
01196 memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
01197 RSTRING_PTR(str2), RSTRING_LEN(str2));
01198 RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
01199
01200 if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
01201 OBJ_TAINT(str3);
01202 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
01203 ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
01204 return str3;
01205 }
01206
01207
01208
01209
01210
01211
01212
01213
01214
01215
01216
01217 VALUE
01218 rb_str_times(VALUE str, VALUE times)
01219 {
01220 VALUE str2;
01221 long n, len;
01222 char *ptr2;
01223
01224 len = NUM2LONG(times);
01225 if (len < 0) {
01226 rb_raise(rb_eArgError, "negative argument");
01227 }
01228 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
01229 rb_raise(rb_eArgError, "argument too big");
01230 }
01231
01232 str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
01233 ptr2 = RSTRING_PTR(str2);
01234 if (len) {
01235 n = RSTRING_LEN(str);
01236 memcpy(ptr2, RSTRING_PTR(str), n);
01237 while (n <= len/2) {
01238 memcpy(ptr2 + n, ptr2, n);
01239 n *= 2;
01240 }
01241 memcpy(ptr2 + n, ptr2, len-n);
01242 }
01243 ptr2[RSTRING_LEN(str2)] = '\0';
01244 OBJ_INFECT(str2, str);
01245 rb_enc_cr_str_copy_for_substr(str2, str);
01246
01247 return str2;
01248 }
01249
01250
01251
01252
01253
01254
01255
01256
01257
01258
01259
01260
01261
01262
01263
01264
01265 static VALUE
01266 rb_str_format_m(VALUE str, VALUE arg)
01267 {
01268 volatile VALUE tmp = rb_check_array_type(arg);
01269
01270 if (!NIL_P(tmp)) {
01271 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str);
01272 }
01273 return rb_str_format(1, &arg, str);
01274 }
01275
01276 static inline void
01277 str_modifiable(VALUE str)
01278 {
01279 if (FL_TEST(str, STR_TMPLOCK)) {
01280 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
01281 }
01282 rb_check_frozen(str);
01283 if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4)
01284 rb_raise(rb_eSecurityError, "Insecure: can't modify string");
01285 }
01286
01287 static inline int
01288 str_independent(VALUE str)
01289 {
01290 str_modifiable(str);
01291 if (!STR_SHARED_P(str)) return 1;
01292 if (STR_EMBED_P(str)) return 1;
01293 return 0;
01294 }
01295
01296 static void
01297 str_make_independent_expand(VALUE str, long expand)
01298 {
01299 char *ptr;
01300 long len = RSTRING_LEN(str);
01301 long capa = len + expand;
01302
01303 if (len > capa) len = capa;
01304 ptr = ALLOC_N(char, capa + 1);
01305 if (RSTRING_PTR(str)) {
01306 memcpy(ptr, RSTRING_PTR(str), len);
01307 }
01308 STR_SET_NOEMBED(str);
01309 STR_UNSET_NOCAPA(str);
01310 ptr[len] = 0;
01311 RSTRING(str)->as.heap.ptr = ptr;
01312 RSTRING(str)->as.heap.len = len;
01313 RSTRING(str)->as.heap.aux.capa = capa;
01314 }
01315
01316 #define str_make_independent(str) str_make_independent_expand((str), 0L)
01317
01318 void
01319 rb_str_modify(VALUE str)
01320 {
01321 if (!str_independent(str))
01322 str_make_independent(str);
01323 ENC_CODERANGE_CLEAR(str);
01324 }
01325
01326 void
01327 rb_str_modify_expand(VALUE str, long expand)
01328 {
01329 if (expand < 0) {
01330 rb_raise(rb_eArgError, "negative expanding string size");
01331 }
01332 if (!str_independent(str)) {
01333 str_make_independent_expand(str, expand);
01334 }
01335 else if (expand > 0) {
01336 long len = RSTRING_LEN(str);
01337 long capa = len + expand;
01338 if (!STR_EMBED_P(str)) {
01339 REALLOC_N(RSTRING(str)->as.heap.ptr, char, capa+1);
01340 RSTRING(str)->as.heap.aux.capa = capa;
01341 }
01342 else if (capa > RSTRING_EMBED_LEN_MAX) {
01343 str_make_independent_expand(str, expand);
01344 }
01345 }
01346 ENC_CODERANGE_CLEAR(str);
01347 }
01348
01349
01350 static void
01351 str_modify_keep_cr(VALUE str)
01352 {
01353 if (!str_independent(str))
01354 str_make_independent(str);
01355 if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
01356
01357 ENC_CODERANGE_CLEAR(str);
01358 }
01359
01360 static inline void
01361 str_discard(VALUE str)
01362 {
01363 str_modifiable(str);
01364 if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
01365 xfree(RSTRING_PTR(str));
01366 RSTRING(str)->as.heap.ptr = 0;
01367 RSTRING(str)->as.heap.len = 0;
01368 }
01369 }
01370
/*
 * Attach +add+ to +str+ as associated data stored in the string's
 * as.heap.aux.shared slot.  When +str+ already has an association
 * (STR_ASSOC_P), +add+ is appended to the existing one with
 * rb_ary_concat().  Raises through rb_check_frozen() for a frozen +str+.
 */
01371 void
01372 rb_str_associate(VALUE str, VALUE add)
01373 {
01374
01375 rb_check_frozen(str);
01376 if (STR_ASSOC_P(str)) {
01377
/* already associated: extend the existing array in place */
01378 rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
01379 }
01380 else {
01381 if (STR_SHARED_P(str)) {
/* remember the shared source before the string gets its own buffer */
01382 VALUE assoc = RSTRING(str)->as.heap.aux.shared;
01383 str_make_independent(str);
/* inherit an association held by the former shared source, if any */
01384 if (STR_ASSOC_P(assoc)) {
01385 assoc = RSTRING(assoc)->as.heap.aux.shared;
01386 rb_ary_concat(assoc, add);
01387 add = assoc;
01388 }
01389 }
01390 else if (STR_EMBED_P(str)) {
/* embedded strings are moved to the heap so the aux slot can be used */
01391 str_make_independent(str);
01392 }
01393 else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
/* make capacity equal to length before aux.shared is written below;
 * presumably aux.capa and aux.shared share storage -- confirm against
 * the RString definition */
01394 RESIZE_CAPA(str, RSTRING_LEN(str));
01395 }
01396 FL_SET(str, STR_ASSOC);
/* NOTE(review): clearing klass presumably hides the array from Ruby
 * code -- confirm */
01397 RBASIC(add)->klass = 0;
01398 RSTRING(str)->as.heap.aux.shared = add;
01399 }
01400 }
01401
01402 VALUE
01403 rb_str_associated(VALUE str)
01404 {
01405 if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
01406 if (STR_ASSOC_P(str)) {
01407 return RSTRING(str)->as.heap.aux.shared;
01408 }
01409 return Qfalse;
01410 }
01411
01412 VALUE
01413 rb_string_value(volatile VALUE *ptr)
01414 {
01415 VALUE s = *ptr;
01416 if (TYPE(s) != T_STRING) {
01417 s = rb_str_to_str(s);
01418 *ptr = s;
01419 }
01420 return s;
01421 }
01422
01423 char *
01424 rb_string_value_ptr(volatile VALUE *ptr)
01425 {
01426 VALUE str = rb_string_value(ptr);
01427 return RSTRING_PTR(str);
01428 }
01429
01430 char *
01431 rb_string_value_cstr(volatile VALUE *ptr)
01432 {
01433 VALUE str = rb_string_value(ptr);
01434 char *s = RSTRING_PTR(str);
01435 long len = RSTRING_LEN(str);
01436
01437 if (!s || memchr(s, 0, len)) {
01438 rb_raise(rb_eArgError, "string contains null byte");
01439 }
01440 if (s[len]) {
01441 rb_str_modify(str);
01442 s = RSTRING_PTR(str);
01443 s[RSTRING_LEN(str)] = 0;
01444 }
01445 return s;
01446 }
01447
01448 VALUE
01449 rb_check_string_type(VALUE str)
01450 {
01451 str = rb_check_convert_type(str, T_STRING, "String", "to_str");
01452 return str;
01453 }
01454
01455
01456
01457
01458
01459
01460
01461
01462
01463
01464
01465
01466 static VALUE
01467 rb_str_s_try_convert(VALUE dummy, VALUE str)
01468 {
01469 return rb_check_string_type(str);
01470 }
01471
/*
 * Advance from +p+ by *nthp characters of encoding +enc+ within the
 * byte range [p, e) and return the resulting position, never past +e+.
 * On return *nthp holds the local counter: in the ASCII-compatible
 * branch this is the number of characters still to skip; in the
 * single-byte and fixed-width branches the counter is never decremented.
 */
01472 static char*
01473 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
01474 {
01475 long nth = *nthp;
/* every character is one byte: plain pointer arithmetic */
01476 if (rb_enc_mbmaxlen(enc) == 1) {
01477 p += nth;
01478 }
/* fixed-width encoding: multiply by the character width */
01479 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01480 p += nth * rb_enc_mbmaxlen(enc);
01481 }
/* variable width but ASCII compatible: skip ASCII runs in bulk */
01482 else if (rb_enc_asciicompat(enc)) {
01483 const char *p2, *e2;
01484 int n;
01485
01486 while (p < e && 0 < nth) {
/* nth characters need at least nth bytes; give up early if they
 * cannot fit before e */
01487 e2 = p + nth;
01488 if (e < e2) {
01489 *nthp = nth;
01490 return (char *)e;
01491 }
01492 if (ISASCII(*p)) {
/* look for the first non-ASCII byte within the next nth bytes */
01493 p2 = search_nonascii(p, e2);
01494 if (!p2) {
/* all ASCII up to e2: that is the answer (note *nthp is stored
 * without being reduced here) */
01495 *nthp = nth;
01496 return (char *)e2;
01497 }
01498 nth -= p2 - p;
01499 p = p2;
01500 }
/* step over one (possibly multibyte) character */
01501 n = rb_enc_mbclen(p, e, enc);
01502 p += n;
01503 nth--;
01504 }
01505 *nthp = nth;
/* ran out of bytes before skipping all requested characters */
01506 if (nth != 0) {
01507 return (char *)e;
01508 }
01509 return (char *)p;
01510 }
01511 else {
/* generic encoding: step one character at a time */
01512 while (p < e && nth--) {
01513 p += rb_enc_mbclen(p, e, enc);
01514 }
01515 }
/* clamp for the branches that computed p without bounds checking */
01516 if (p > e) p = e;
01517 *nthp = nth;
01518 return (char*)p;
01519 }
01520
01521 char*
01522 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
01523 {
01524 return str_nth_len(p, e, &nth, enc);
01525 }
01526
01527 static char*
01528 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01529 {
01530 if (singlebyte)
01531 p += nth;
01532 else {
01533 p = str_nth_len(p, e, &nth, enc);
01534 }
01535 if (!p) return 0;
01536 if (p > e) p = e;
01537 return (char *)p;
01538 }
01539
01540
01541 static long
01542 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01543 {
01544 const char *pp = str_nth(p, e, nth, enc, singlebyte);
01545 if (!pp) return e - p;
01546 return pp - p;
01547 }
01548
01549 long
01550 rb_str_offset(VALUE str, long pos)
01551 {
01552 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
01553 STR_ENC_GET(str), single_byte_optimizable(str));
01554 }
01555
01556 #ifdef NONASCII_MASK
/*
 * UTF-8 variant of the "nth character" search: advance from +p+ within
 * [p, e) until *nthp lead bytes have been passed, counting characters by
 * their lead bytes only.  Returns the resulting position and stores the
 * remaining count in *nthp.  The caller in this file only uses it for
 * strings whose code range is ENC_CODERANGE_VALID.
 */
01557 static char *
01558 str_utf8_nth(const char *p, const char *e, long *nthp)
01559 {
01560 long nth = *nthp;
/* fast path only when both the byte range and the count exceed two
 * VALUE-sized words */
01561 if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) {
01562 const VALUE *s, *t;
01563 const VALUE lowbits = sizeof(VALUE) - 1;
/* s: p rounded up to a VALUE boundary; t: e rounded down to one */
01564 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01565 t = (const VALUE*)(~lowbits & (VALUE)e);
/* count byte by byte up to the first aligned word */
01566 while (p < (const char *)s) {
01567 if (is_utf8_lead_byte(*p)) nth--;
01568 p++;
01569 }
/* then a word at a time, while a whole word of lead bytes could
 * still be consumed without overshooting the count */
01570 do {
01571 nth -= count_utf8_lead_bytes_with_word(s);
01572 s++;
01573 } while (s < t && (int)sizeof(VALUE) <= nth);
01574 p = (char *)s;
01575 }
/* finish byte by byte; stop on the lead byte of the target character */
01576 while (p < e) {
01577 if (is_utf8_lead_byte(*p)) {
01578 if (nth == 0) break;
01579 nth--;
01580 }
01581 p++;
01582 }
01583 *nthp = nth;
01584 return (char *)p;
01585 }
01586
01587 static long
01588 str_utf8_offset(const char *p, const char *e, long nth)
01589 {
01590 const char *pp = str_utf8_nth(p, e, &nth);
01591 return pp - p;
01592 }
01593 #endif
01594
01595
01596 long
01597 rb_str_sublen(VALUE str, long pos)
01598 {
01599 if (single_byte_optimizable(str) || pos < 0)
01600 return pos;
01601 else {
01602 char *p = RSTRING_PTR(str);
01603 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
01604 }
01605 }
01606
01607 VALUE
01608 rb_str_subseq(VALUE str, long beg, long len)
01609 {
01610 VALUE str2;
01611
01612 if (RSTRING_LEN(str) == beg + len &&
01613 RSTRING_EMBED_LEN_MAX < len) {
01614 str2 = rb_str_new_shared(rb_str_new_frozen(str));
01615 rb_str_drop_bytes(str2, beg);
01616 }
01617 else {
01618 str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
01619 }
01620
01621 rb_enc_cr_str_copy_for_substr(str2, str);
01622 OBJ_INFECT(str2, str);
01623
01624 return str2;
01625 }
01626
/*
 * Return the substring of +str+ that starts at character index +beg+
 * and spans up to +len+ characters.  A negative +beg+ counts from the
 * end.  Returns Qnil when +len+ is negative or +beg+ lies outside the
 * string.
 */
01627 VALUE
01628 rb_str_substr(VALUE str, long beg, long len)
01629 {
01630 rb_encoding *enc = STR_ENC_GET(str);
01631 VALUE str2;
01632 char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);
01633
01634 if (len < 0) return Qnil;
01635 if (!RSTRING_LEN(str)) {
01636 len = 0;
01637 }
/* byte-addressable strings: character indices are byte offsets */
01638 if (single_byte_optimizable(str)) {
01639 if (beg > RSTRING_LEN(str)) return Qnil;
01640 if (beg < 0) {
01641 beg += RSTRING_LEN(str);
01642 if (beg < 0) return Qnil;
01643 }
/* clip the length to what is available after beg */
01644 if (beg + len > RSTRING_LEN(str))
01645 len = RSTRING_LEN(str) - beg;
01646 if (len <= 0) {
01647 len = 0;
01648 p = 0;
01649 }
01650 else
01651 p = s + beg;
01652 goto sub;
01653 }
01654 if (beg < 0) {
/* cannot take more characters than remain after a negative start */
01655 if (len > -beg) len = -beg;
/* start is close to the end (size heuristic below): walk backwards
 * from the end instead of counting the whole string */
01656 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
01657 beg = -beg;
/* move e back to the end of the requested span ... */
01658 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
01659 p = e;
01660 if (!p) return Qnil;
/* ... then move p back len characters to its start */
01661 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
01662 if (!p) return Qnil;
01663 len = e - p;
01664 goto sub;
01665 }
01666 else {
/* otherwise turn the negative index into a positive one */
01667 beg += str_strlen(str, enc);
01668 if (beg < 0) return Qnil;
01669 }
01670 }
/* a character index can never exceed the byte length */
01671 else if (beg > 0 && beg > RSTRING_LEN(str)) {
01672 return Qnil;
01673 }
01674 if (len == 0) {
01675 if (beg > str_strlen(str, enc)) return Qnil;
01676 p = 0;
01677 }
01678 #ifdef NONASCII_MASK
/* valid UTF-8: use the lead-byte counting helpers */
01679 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01680 enc == rb_utf8_encoding()) {
01681 p = str_utf8_nth(s, e, &beg);
/* characters left to skip means beg was past the end */
01682 if (beg > 0) return Qnil;
01683 len = str_utf8_offset(p, e, len);
01684 }
01685 #endif
/* fixed-width encoding: scale indices by the character size */
01686 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01687 int char_sz = rb_enc_mbmaxlen(enc);
01688
01689 p = s + beg * char_sz;
01690 if (p > e) {
01691 return Qnil;
01692 }
01693 else if (len * char_sz > e - p)
01694 len = e - p;
01695 else
01696 len *= char_sz;
01697 }
/* general case: locate the start; reaching e yields an empty result
 * or Qnil depending on whether characters were left to skip */
01698 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
01699 if (beg > 0) return Qnil;
01700 len = 0;
01701 }
01702 else {
/* convert the character count into a byte length */
01703 len = str_offset(p, e, len, enc, 0);
01704 }
/* from here on len is a byte count and p the start pointer (or 0) */
01705 sub:
/* NOTE(review): on the multibyte paths beg is not a byte offset when
 * this test runs -- confirm the sharing condition is still correct */
01706 if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) {
/* long tail slice: share the frozen source buffer */
01707 str2 = rb_str_new4(str);
01708 str2 = str_new3(rb_obj_class(str2), str2);
01709 RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
01710 RSTRING(str2)->as.heap.len = len;
01711 }
01712 else {
/* otherwise copy the bytes and propagate encoding and taint */
01713 str2 = rb_str_new5(str, p, len);
01714 rb_enc_cr_str_copy_for_substr(str2, str);
01715 OBJ_INFECT(str2, str);
01716 }
01717
01718 return str2;
01719 }
01720
01721 VALUE
01722 rb_str_freeze(VALUE str)
01723 {
01724 if (STR_ASSOC_P(str)) {
01725 VALUE ary = RSTRING(str)->as.heap.aux.shared;
01726 OBJ_FREEZE(ary);
01727 }
01728 return rb_obj_freeze(str);
01729 }
01730
/*
 * Provide rb_str_dup_frozen as an alias of rb_str_new_frozen
 * (presumably RUBY_ALIAS_FUNCTION emits an exported symbol -- confirm
 * against its definition).  The #define makes later uses of the name in
 * this file expand to rb_str_new_frozen directly.
 */
01731 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
01732 #define rb_str_dup_frozen rb_str_new_frozen
01733
01734 VALUE
01735 rb_str_locktmp(VALUE str)
01736 {
01737 if (FL_TEST(str, STR_TMPLOCK)) {
01738 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
01739 }
01740 FL_SET(str, STR_TMPLOCK);
01741 return str;
01742 }
01743
01744 VALUE
01745 rb_str_unlocktmp(VALUE str)
01746 {
01747 if (!FL_TEST(str, STR_TMPLOCK)) {
01748 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
01749 }
01750 FL_UNSET(str, STR_TMPLOCK);
01751 return str;
01752 }
01753
01754 void
01755 rb_str_set_len(VALUE str, long len)
01756 {
01757 long capa;
01758
01759 str_modifiable(str);
01760 if (STR_SHARED_P(str)) {
01761 rb_raise(rb_eRuntimeError, "can't set length of shared string");
01762 }
01763 if (len > (capa = (long)rb_str_capacity(str))) {
01764 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
01765 }
01766 STR_SET_LEN(str, len);
01767 RSTRING_PTR(str)[len] = '\0';
01768 }
01769
/*
 * Change the byte length of +str+ to +len+, switching between the
 * embedded and heap representations as needed, and return +str+.
 * Raises ArgumentError for a negative +len+.  The cached code range is
 * always cleared.  When growing, the new bytes are not initialized
 * here apart from the terminator.
 */
01770 VALUE
01771 rb_str_resize(VALUE str, long len)
01772 {
01773 long slen;
01774 int independent;
01775
01776 if (len < 0) {
01777 rb_raise(rb_eArgError, "negative string size (or size too big)");
01778 }
01779
/* also performs the frozen/locked/security checks */
01780 independent = str_independent(str);
01781 ENC_CODERANGE_CLEAR(str);
01782 slen = RSTRING_LEN(str);
01783 if (len != slen) {
01784 if (STR_EMBED_P(str)) {
/* embedded and still fits: just adjust the embedded length */
01785 if (len <= RSTRING_EMBED_LEN_MAX) {
01786 STR_SET_EMBED_LEN(str, len);
01787 RSTRING(str)->as.ary[len] = '\0';
01788 return str;
01789 }
/* embedded but too long now: move to a heap buffer */
01790 str_make_independent_expand(str, len - slen);
01791 STR_SET_NOEMBED(str);
01792 }
01793 else if (len <= RSTRING_EMBED_LEN_MAX) {
/* heap string that now fits in the object: copy the bytes into the
 * embedded area; ptr is saved first because the embedded array is
 * written over the heap fields */
01794 char *ptr = RSTRING(str)->as.heap.ptr;
01795 STR_SET_EMBED(str);
01796 if (slen > len) slen = len;
01797 if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
01798 RSTRING(str)->as.ary[len] = '\0';
01799 STR_SET_EMBED_LEN(str, len);
/* free the old buffer only if this string owned it */
01800 if (independent) xfree(ptr);
01801 return str;
01802 }
01803 else if (!independent) {
/* shared heap string: get a private buffer of the new size */
01804 str_make_independent_expand(str, len - slen);
01805 }
/* owned heap string: reallocate when growing, or when shrinking by
 * more than 1024 bytes */
01806 else if (slen < len || slen - len > 1024) {
01807 REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
01808 }
/* aux holds a capacity only when the string is neither shared nor
 * associated */
01809 if (!STR_NOCAPA_P(str)) {
01810 RSTRING(str)->as.heap.aux.capa = len;
01811 }
01812 RSTRING(str)->as.heap.len = len;
01813 RSTRING(str)->as.heap.ptr[len] = '\0';
01814 }
01815 return str;
01816 }
01817
01818 static VALUE
01819 str_buf_cat(VALUE str, const char *ptr, long len)
01820 {
01821 long capa, total, off = -1;
01822
01823 if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
01824 off = ptr - RSTRING_PTR(str);
01825 }
01826 rb_str_modify(str);
01827 if (len == 0) return 0;
01828 if (STR_ASSOC_P(str)) {
01829 FL_UNSET(str, STR_ASSOC);
01830 capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
01831 }
01832 else if (STR_EMBED_P(str)) {
01833 capa = RSTRING_EMBED_LEN_MAX;
01834 }
01835 else {
01836 capa = RSTRING(str)->as.heap.aux.capa;
01837 }
01838 if (RSTRING_LEN(str) >= LONG_MAX - len) {
01839 rb_raise(rb_eArgError, "string sizes too big");
01840 }
01841 total = RSTRING_LEN(str)+len;
01842 if (capa <= total) {
01843 while (total > capa) {
01844 if (capa + 1 >= LONG_MAX / 2) {
01845 capa = (total + 4095) / 4096;
01846 break;
01847 }
01848 capa = (capa + 1) * 2;
01849 }
01850 RESIZE_CAPA(str, capa);
01851 }
01852 if (off != -1) {
01853 ptr = RSTRING_PTR(str) + off;
01854 }
01855 memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
01856 STR_SET_LEN(str, total);
01857 RSTRING_PTR(str)[total] = '\0';
01858
01859 return str;
01860 }
01861
01862 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
01863
01864 VALUE
01865 rb_str_buf_cat(VALUE str, const char *ptr, long len)
01866 {
01867 if (len == 0) return str;
01868 if (len < 0) {
01869 rb_raise(rb_eArgError, "negative string size (or size too big)");
01870 }
01871 return str_buf_cat(str, ptr, len);
01872 }
01873
01874 VALUE
01875 rb_str_buf_cat2(VALUE str, const char *ptr)
01876 {
01877 return rb_str_buf_cat(str, ptr, strlen(ptr));
01878 }
01879
01880 VALUE
01881 rb_str_cat(VALUE str, const char *ptr, long len)
01882 {
01883 if (len < 0) {
01884 rb_raise(rb_eArgError, "negative string size (or size too big)");
01885 }
01886 if (STR_ASSOC_P(str)) {
01887 char *p;
01888 rb_str_modify_expand(str, len);
01889 p = RSTRING(str)->as.heap.ptr;
01890 memcpy(p + RSTRING(str)->as.heap.len, ptr, len);
01891 len = RSTRING(str)->as.heap.len += len;
01892 p[len] = '\0';
01893 return str;
01894 }
01895
01896 return rb_str_buf_cat(str, ptr, len);
01897 }
01898
01899 VALUE
01900 rb_str_cat2(VALUE str, const char *ptr)
01901 {
01902 return rb_str_cat(str, ptr, strlen(ptr));
01903 }
01904
/*
 * Append +len+ bytes at +ptr+ (encoding index +ptr_encindex+, code
 * range +ptr_cr+) to +str+, checking that the two encodings can be
 * combined and computing the encoding index and code range of the
 * result.  If +ptr_cr_ret+ is non-NULL it receives the code range
 * determined for the appended bytes.  Returns +str+.
 *
 * Raises rb_eEncCompatError for incompatible encodings and
 * ArgumentError for a negative +len+.
 */
01905 static VALUE
01906 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
01907 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
01908 {
01909 int str_encindex = ENCODING_GET(str);
01910 int res_encindex;
01911 int str_cr, res_cr;
/* presumably encoding index 0 is ASCII-8BIT -- confirm */
01912 int ptr_a8 = ptr_encindex == 0;
01913
01914 str_cr = ENC_CODERANGE(str);
01915
01916 if (str_encindex == ptr_encindex) {
/* same encoding: skip scanning when the result's range will be
 * unknown anyway */
01917 if (str_cr == ENC_CODERANGE_UNKNOWN ||
01918 (ptr_a8 && str_cr != ENC_CODERANGE_7BIT)) {
01919 ptr_cr = ENC_CODERANGE_UNKNOWN;
01920 }
01921 else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
01922 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
01923 }
01924 }
01925 else {
01926 rb_encoding *str_enc = rb_enc_from_index(str_encindex);
01927 rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
/* different encodings and at least one is not ASCII compatible:
 * only trivial cases (nothing to append, or empty receiver) work */
01928 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
01929 if (len == 0)
01930 return str;
01931 if (RSTRING_LEN(str) == 0) {
/* empty receiver simply adopts the appended encoding */
01932 rb_str_buf_cat(str, ptr, len);
01933 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
01934 return str;
01935 }
01936 goto incompatible;
01937 }
01938 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
01939 ptr_cr = coderange_scan(ptr, len, ptr_enc);
01940 }
01941 if (str_cr == ENC_CODERANGE_UNKNOWN) {
/* the receiver's range only matters when the appended bytes are not
 * plain 7-bit (or the receiver is ASCII-8BIT) */
01942 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
01943 str_cr = rb_enc_str_coderange(str);
01944 }
01945 }
01946 }
01947 if (ptr_cr_ret)
01948 *ptr_cr_ret = ptr_cr;
01949
/* different encodings are acceptable only if at least one side is
 * 7-bit */
01950 if (str_encindex != ptr_encindex &&
01951 str_cr != ENC_CODERANGE_7BIT &&
01952 ptr_cr != ENC_CODERANGE_7BIT) {
/* also the target of the goto above */
01953 incompatible:
01954 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
01955 rb_enc_name(rb_enc_from_index(str_encindex)),
01956 rb_enc_name(rb_enc_from_index(ptr_encindex)));
01957 }
01958
/* derive the encoding and code range of the concatenation */
01959 if (str_cr == ENC_CODERANGE_UNKNOWN) {
01960 res_encindex = str_encindex;
01961 res_cr = ENC_CODERANGE_UNKNOWN;
01962 }
01963 else if (str_cr == ENC_CODERANGE_7BIT) {
01964 if (ptr_cr == ENC_CODERANGE_7BIT) {
01965 res_encindex = str_encindex;
01966 res_cr = ENC_CODERANGE_7BIT;
01967 }
01968 else {
/* 7-bit receiver takes over the encoding of the appended bytes */
01969 res_encindex = ptr_encindex;
01970 res_cr = ptr_cr;
01971 }
01972 }
01973 else if (str_cr == ENC_CODERANGE_VALID) {
01974 res_encindex = str_encindex;
01975 if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID)
01976 res_cr = str_cr;
01977 else
01978 res_cr = ptr_cr;
01979 }
01980 else {
/* remaining case: the receiver's range is none of the above;
 * appending anything makes the result unknown again */
01981 res_encindex = str_encindex;
01982 res_cr = str_cr;
01983 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
01984 }
01985
01986 if (len < 0) {
01987 rb_raise(rb_eArgError, "negative string size (or size too big)");
01988 }
01989 str_buf_cat(str, ptr, len);
01990 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
01991 return str;
01992 }
01993
01994 VALUE
01995 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
01996 {
01997 return rb_enc_cr_str_buf_cat(str, ptr, len,
01998 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
01999 }
02000
02001 VALUE
02002 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
02003 {
02004
02005 int encindex = ENCODING_GET(str);
02006 rb_encoding *enc = rb_enc_from_index(encindex);
02007 if (rb_enc_asciicompat(enc)) {
02008 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
02009 encindex, ENC_CODERANGE_7BIT, 0);
02010 }
02011 else {
02012 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
02013 while (*ptr) {
02014 unsigned int c = (unsigned char)*ptr;
02015 int len = rb_enc_codelen(c, enc);
02016 rb_enc_mbcput(c, buf, enc);
02017 rb_enc_cr_str_buf_cat(str, buf, len,
02018 encindex, ENC_CODERANGE_VALID, 0);
02019 ptr++;
02020 }
02021 return str;
02022 }
02023 }
02024
02025 VALUE
02026 rb_str_buf_append(VALUE str, VALUE str2)
02027 {
02028 int str2_cr;
02029
02030 str2_cr = ENC_CODERANGE(str2);
02031
02032 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
02033 ENCODING_GET(str2), str2_cr, &str2_cr);
02034
02035 OBJ_INFECT(str, str2);
02036 ENC_CODERANGE_SET(str2, str2_cr);
02037
02038 return str;
02039 }
02040
02041 VALUE
02042 rb_str_append(VALUE str, VALUE str2)
02043 {
02044 rb_encoding *enc;
02045 int cr, cr2;
02046 long len2;
02047
02048 StringValue(str2);
02049 if ((len2 = RSTRING_LEN(str2)) > 0 && STR_ASSOC_P(str)) {
02050 long len = RSTRING_LEN(str) + len2;
02051 enc = rb_enc_check(str, str2);
02052 cr = ENC_CODERANGE(str);
02053 if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
02054 rb_str_modify_expand(str, len2);
02055 memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
02056 RSTRING_PTR(str2), len2+1);
02057 RSTRING(str)->as.heap.len = len;
02058 rb_enc_associate(str, enc);
02059 ENC_CODERANGE_SET(str, cr);
02060 OBJ_INFECT(str, str2);
02061 return str;
02062 }
02063 return rb_str_buf_append(str, str2);
02064 }
02065
02066
02067
02068
02069
02070
02071
02072
02073
02074
02075
02076
02077
02078
02079
02080
02081
02082 VALUE
02083 rb_str_concat(VALUE str1, VALUE str2)
02084 {
02085 unsigned int code;
02086 rb_encoding *enc = STR_ENC_GET(str1);
02087
02088 if (FIXNUM_P(str2) || TYPE(str2) == T_BIGNUM) {
02089 if (rb_num_to_uint(str2, &code) == 0) {
02090 }
02091 else if (FIXNUM_P(str2)) {
02092 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
02093 }
02094 else {
02095 rb_raise(rb_eRangeError, "bignum out of char range");
02096 }
02097 }
02098 else {
02099 return rb_str_append(str1, str2);
02100 }
02101
02102 if (enc == rb_usascii_encoding()) {
02103
02104 char buf[1] = {(char)code};
02105 if (code > 0xFF) {
02106 rb_raise(rb_eRangeError, "%u out of char range", code);
02107 }
02108 rb_str_cat(str1, buf, 1);
02109 if (code > 127) {
02110 rb_enc_associate(str1, rb_ascii8bit_encoding());
02111 ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID);
02112 }
02113 }
02114 else {
02115 long pos = RSTRING_LEN(str1);
02116 int cr = ENC_CODERANGE(str1);
02117 int len;
02118 char *buf;
02119
02120 switch (len = rb_enc_codelen(code, enc)) {
02121 case ONIGERR_INVALID_CODE_POINT_VALUE:
02122 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
02123 break;
02124 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
02125 case 0:
02126 rb_raise(rb_eRangeError, "%u out of char range", code);
02127 break;
02128 }
02129 buf = ALLOCA_N(char, len + 1);
02130 rb_enc_mbcput(code, buf, enc);
02131 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
02132 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
02133 }
02134 rb_str_resize(str1, pos+len);
02135 strncpy(RSTRING_PTR(str1) + pos, buf, len);
02136 if (cr == ENC_CODERANGE_7BIT && code > 127)
02137 cr = ENC_CODERANGE_VALID;
02138 ENC_CODERANGE_SET(str1, cr);
02139 }
02140 return str1;
02141 }
02142
02143
02144
02145
02146
02147
02148
02149
02150
02151
02152
02153
02154 static VALUE
02155 rb_str_prepend(VALUE str, VALUE str2)
02156 {
02157 StringValue(str2);
02158 StringValue(str);
02159 rb_str_update(str, 0L, 0L, str2);
02160 return str;
02161 }
02162
02163 st_index_t
02164 rb_memhash(const void *ptr, long len)
02165 {
02166 return st_hash(ptr, len, rb_hash_start((st_index_t)len));
02167 }
02168
02169 st_index_t
02170 rb_str_hash(VALUE str)
02171 {
02172 int e = ENCODING_GET(str);
02173 if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
02174 e = 0;
02175 }
02176 return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
02177 }
02178
02179 int
02180 rb_str_hash_cmp(VALUE str1, VALUE str2)
02181 {
02182 long len;
02183
02184 if (!rb_str_comparable(str1, str2)) return 1;
02185 if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
02186 memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
02187 return 0;
02188 }
02189 return 1;
02190 }
02191
02192
02193
02194
02195
02196
02197
02198
02199 static VALUE
02200 rb_str_hash_m(VALUE str)
02201 {
02202 st_index_t hval = rb_str_hash(str);
02203 return INT2FIX(hval);
02204 }
02205
02206 #define lesser(a,b) (((a)>(b))?(b):(a))
02207
02208 int
02209 rb_str_comparable(VALUE str1, VALUE str2)
02210 {
02211 int idx1, idx2;
02212 int rc1, rc2;
02213
02214 if (RSTRING_LEN(str1) == 0) return TRUE;
02215 if (RSTRING_LEN(str2) == 0) return TRUE;
02216 idx1 = ENCODING_GET(str1);
02217 idx2 = ENCODING_GET(str2);
02218 if (idx1 == idx2) return TRUE;
02219 rc1 = rb_enc_str_coderange(str1);
02220 rc2 = rb_enc_str_coderange(str2);
02221 if (rc1 == ENC_CODERANGE_7BIT) {
02222 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
02223 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
02224 return TRUE;
02225 }
02226 if (rc2 == ENC_CODERANGE_7BIT) {
02227 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
02228 return TRUE;
02229 }
02230 return FALSE;
02231 }
02232
02233 int
02234 rb_str_cmp(VALUE str1, VALUE str2)
02235 {
02236 long len1, len2;
02237 const char *ptr1, *ptr2;
02238 int retval;
02239
02240 if (str1 == str2) return 0;
02241 RSTRING_GETMEM(str1, ptr1, len1);
02242 RSTRING_GETMEM(str2, ptr2, len2);
02243 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
02244 if (len1 == len2) {
02245 if (!rb_str_comparable(str1, str2)) {
02246 if (ENCODING_GET(str1) > ENCODING_GET(str2))
02247 return 1;
02248 return -1;
02249 }
02250 return 0;
02251 }
02252 if (len1 > len2) return 1;
02253 return -1;
02254 }
02255 if (retval > 0) return 1;
02256 return -1;
02257 }
02258
02259
02260 static VALUE
02261 str_eql(const VALUE str1, const VALUE str2)
02262 {
02263 const long len = RSTRING_LEN(str1);
02264 const char *ptr1, *ptr2;
02265
02266 if (len != RSTRING_LEN(str2)) return Qfalse;
02267 if (!rb_str_comparable(str1, str2)) return Qfalse;
02268 if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2)))
02269 return Qtrue;
02270 if (memcmp(ptr1, ptr2, len) == 0)
02271 return Qtrue;
02272 return Qfalse;
02273 }
02274
02275
02276
02277
02278
02279
02280
02281
02282
02283 VALUE
02284 rb_str_equal(VALUE str1, VALUE str2)
02285 {
02286 if (str1 == str2) return Qtrue;
02287 if (TYPE(str2) != T_STRING) {
02288 if (!rb_respond_to(str2, rb_intern("to_str"))) {
02289 return Qfalse;
02290 }
02291 return rb_equal(str2, str1);
02292 }
02293 return str_eql(str1, str2);
02294 }
02295
02296
02297
02298
02299
02300
02301
02302
02303 static VALUE
02304 rb_str_eql(VALUE str1, VALUE str2)
02305 {
02306 if (str1 == str2) return Qtrue;
02307 if (TYPE(str2) != T_STRING) return Qfalse;
02308 return str_eql(str1, str2);
02309 }
02310
02311
02312
02313
02314
02315
02316
02317
02318
02319
02320
02321
02322
02323
02324
02325
02326
02327
02328
02329
02330
02331
02332
02333
02334 static VALUE
02335 rb_str_cmp_m(VALUE str1, VALUE str2)
02336 {
02337 long result;
02338
02339 if (TYPE(str2) != T_STRING) {
02340 if (!rb_respond_to(str2, rb_intern("to_str"))) {
02341 return Qnil;
02342 }
02343 else if (!rb_respond_to(str2, rb_intern("<=>"))) {
02344 return Qnil;
02345 }
02346 else {
02347 VALUE tmp = rb_funcall(str2, rb_intern("<=>"), 1, str1);
02348
02349 if (NIL_P(tmp)) return Qnil;
02350 if (!FIXNUM_P(tmp)) {
02351 return rb_funcall(LONG2FIX(0), '-', 1, tmp);
02352 }
02353 result = -FIX2LONG(tmp);
02354 }
02355 }
02356 else {
02357 result = rb_str_cmp(str1, str2);
02358 }
02359 return LONG2NUM(result);
02360 }
02361
02362
02363
02364
02365
02366
02367
02368
02369
02370
02371
02372
02373
/*
 * Case-insensitive three-way comparison of +str1+ with +str2+
 * (converted to a String).  Only characters handled by TOUPPER are
 * folded; other characters are compared by their bytes.  Returns
 * INT2FIX(-1), INT2FIX(0) or INT2FIX(1), or Qnil when the two strings
 * have no compatible encoding.
 */
02374 static VALUE
02375 rb_str_casecmp(VALUE str1, VALUE str2)
02376 {
02377 long len;
02378 rb_encoding *enc;
02379 char *p1, *p1end, *p2, *p2end;
02380
02381 StringValue(str2);
02382 enc = rb_enc_compatible(str1, str2);
02383 if (!enc) {
02384 return Qnil;
02385 }
02386
02387 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
02388 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
/* both strings can be walked byte by byte */
02389 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
02390 while (p1 < p1end && p2 < p2end) {
02391 if (*p1 != *p2) {
/* mask to 0..255 so a negative char is never passed to TOUPPER */
02392 unsigned int c1 = TOUPPER(*p1 & 0xff);
02393 unsigned int c2 = TOUPPER(*p2 & 0xff);
02394 if (c1 != c2)
02395 return INT2FIX(c1 < c2 ? -1 : 1);
02396 }
02397 p1++;
02398 p2++;
02399 }
02400 }
02401 else {
02402 while (p1 < p1end && p2 < p2end) {
/* rb_enc_ascget yields a non-negative code for an ASCII character
 * and stores its byte length in l1/l2 */
02403 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
02404 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
02405
02406 if (0 <= c1 && 0 <= c2) {
02407 c1 = TOUPPER(c1);
02408 c2 = TOUPPER(c2);
02409 if (c1 != c2)
02410 return INT2FIX(c1 < c2 ? -1 : 1);
02411 }
02412 else {
/* at least one side is not ASCII: compare the raw bytes of the two
 * characters, the shorter one sorting first on a tie */
02413 int r;
02414 l1 = rb_enc_mbclen(p1, p1end, enc);
02415 l2 = rb_enc_mbclen(p2, p2end, enc);
02416 len = l1 < l2 ? l1 : l2;
02417 r = memcmp(p1, p2, len);
02418 if (r != 0)
02419 return INT2FIX(r < 0 ? -1 : 1);
02420 if (l1 != l2)
02421 return INT2FIX(l1 < l2 ? -1 : 1);
02422 }
02423 p1 += l1;
02424 p2 += l2;
02425 }
02426 }
/* one string is a case-insensitive prefix of the other: decide by
 * byte length */
02427 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
02428 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
02429 return INT2FIX(-1);
02430 }
02431
/*
 * Search +str+ for +sub+ starting at character index +offset+ (a
 * negative offset counts from the end).  Returns -1 when +sub+ is a
 * broken string, the offset is out of range, or there is no match.
 * NOTE(review): the value returned on a match is pos + offset after
 * offset has been turned into a byte offset, i.e. presumably a byte
 * position; the caller in this file passes it to rb_str_sublen() --
 * confirm.
 */
02432 static long
02433 rb_str_index(VALUE str, VALUE sub, long offset)
02434 {
02435 long pos;
02436 char *s, *sptr, *e;
02437 long len, slen;
02438 rb_encoding *enc;
02439
/* rb_enc_check presumably raises for incompatible encodings -- confirm */
02440 enc = rb_enc_check(str, sub);
02441 if (is_broken_string(sub)) {
02442 return -1;
02443 }
/* lengths in characters, used for the range checks only */
02444 len = str_strlen(str, enc);
02445 slen = str_strlen(sub, enc);
02446 if (offset < 0) {
02447 offset += len;
02448 if (offset < 0) return -1;
02449 }
/* not enough characters left for sub to fit */
02450 if (len - offset < slen) return -1;
02451 s = RSTRING_PTR(str);
02452 e = s + RSTRING_LEN(str);
02453 if (offset) {
/* from here on offset is a byte offset */
02454 offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
02455 s += offset;
02456 }
/* an empty needle matches right at the start position */
02457 if (slen == 0) return offset;
02458
/* switch to byte lengths for the raw memory search */
02459 sptr = RSTRING_PTR(sub);
02460 slen = RSTRING_LEN(sub);
02461 len = RSTRING_LEN(str) - offset;
02462 for (;;) {
02463 char *t;
02464 pos = rb_memsearch(sptr, slen, s, len, enc);
02465 if (pos < 0) return pos;
/* accept the hit only if it starts on a character boundary;
 * otherwise continue from the next character head */
02466 t = rb_enc_right_char_head(s, s+pos, e, enc);
02467 if (t == s + pos) break;
02468 if ((len -= t - s) <= 0) return -1;
02469 offset += t - s;
02470 s = t;
02471 }
02472 return pos + offset;
02473 }
02474
02475
02476
02477
02478
02479
02480
02481
02482
02483
02484
02485
02486
02487
02488
02489
02490
02491
02492
/*
 * Method implementation taking (pattern [, start]): find the first
 * occurrence of the pattern -- a Regexp, a String, or an object
 * convertible to String -- in +str+ at or after character index start
 * (default 0; negative counts from the end).  Returns the character
 * index of the match as a number, or Qnil when nothing is found.
 * Raises TypeError for a pattern that cannot be converted to a String.
 */
02493 static VALUE
02494 rb_str_index_m(int argc, VALUE *argv, VALUE str)
02495 {
02496 VALUE sub;
02497 VALUE initpos;
02498 long pos;
02499
02500 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
02501 pos = NUM2LONG(initpos);
02502 }
02503 else {
02504 pos = 0;
02505 }
02506 if (pos < 0) {
02507 pos += str_strlen(str, STR_ENC_GET(str));
02508 if (pos < 0) {
/* start before the beginning: no match; for a Regexp the last-match
 * reference is reset to nil as well */
02509 if (TYPE(sub) == T_REGEXP) {
02510 rb_backref_set(Qnil);
02511 }
02512 return Qnil;
02513 }
02514 }
02515
02516 switch (TYPE(sub)) {
02517 case T_REGEXP:
02518 if (pos > str_strlen(str, STR_ENC_GET(str)))
02519 return Qnil;
/* rb_reg_search is given a byte offset, so convert the character
 * index first */
02520 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02521 rb_enc_check(str, sub), single_byte_optimizable(str));
02522
02523 pos = rb_reg_search(sub, str, pos, 0);
/* convert the resulting byte position back to a character index */
02524 pos = rb_str_sublen(str, pos);
02525 break;
02526
02527 default: {
02528 VALUE tmp;
02529
02530 tmp = rb_check_string_type(sub);
02531 if (NIL_P(tmp)) {
02532 rb_raise(rb_eTypeError, "type mismatch: %s given",
02533 rb_obj_classname(sub));
02534 }
02535 sub = tmp;
02536 }
02537
/* the default case above deliberately falls through with the
 * converted string */
02538 case T_STRING:
02539 pos = rb_str_index(str, sub, pos);
02540 pos = rb_str_sublen(str, pos);
02541 break;
02542 }
02543
02544 if (pos == -1) return Qnil;
02545 return LONG2NUM(pos);
02546 }
02547
02548 static long
02549 rb_str_rindex(VALUE str, VALUE sub, long pos)
02550 {
02551 long len, slen;
02552 char *s, *sbeg, *e, *t;
02553 rb_encoding *enc;
02554 int singlebyte = single_byte_optimizable(str);
02555
02556 enc = rb_enc_check(str, sub);
02557 if (is_broken_string(sub)) {
02558 return -1;
02559 }
02560 len = str_strlen(str, enc);
02561 slen = str_strlen(sub, enc);
02562
02563 if (len < slen) return -1;
02564 if (len - pos < slen) {
02565 pos = len - slen;
02566 }
02567 if (len == 0) {
02568 return pos;
02569 }
02570 sbeg = RSTRING_PTR(str);
02571 e = RSTRING_END(str);
02572 t = RSTRING_PTR(sub);
02573 slen = RSTRING_LEN(sub);
02574 s = str_nth(sbeg, e, pos, enc, singlebyte);
02575 while (s) {
02576 if (memcmp(s, t, slen) == 0) {
02577 return pos;
02578 }
02579 if (pos == 0) break;
02580 pos--;
02581 s = rb_enc_prev_char(sbeg, s, e, enc);
02582 }
02583 return -1;
02584 }
02585
02586
02587
02588
02589
02590
02591
02592
02593
02594
02595
02596
02597
02598
02599
02600
02601
02602
02603
02604
/*
 * Method implementation taking (pattern [, start]): find the last
 * occurrence of the pattern -- a Regexp, a String, or an object
 * convertible to String -- in +str+ at or before character index start
 * (default: the string's length; negative counts from the end).
 * Returns the character index as a number, or Qnil when nothing is
 * found.  Raises TypeError for a pattern that cannot be converted to a
 * String.
 */
02605 static VALUE
02606 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
02607 {
02608 VALUE sub;
02609 VALUE vpos;
02610 rb_encoding *enc = STR_ENC_GET(str);
02611 long pos, len = str_strlen(str, enc);
02612
02613 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
02614 pos = NUM2LONG(vpos);
02615 if (pos < 0) {
02616 pos += len;
02617 if (pos < 0) {
/* start before the beginning: no match; for a Regexp the last-match
 * reference is reset to nil as well */
02618 if (TYPE(sub) == T_REGEXP) {
02619 rb_backref_set(Qnil);
02620 }
02621 return Qnil;
02622 }
02623 }
/* a start past the end is clamped to the end */
02624 if (pos > len) pos = len;
02625 }
02626 else {
02627 pos = len;
02628 }
02629
02630 switch (TYPE(sub)) {
02631 case T_REGEXP:
02632
/* rb_reg_search is given a byte offset, so convert the character
 * index first */
02633 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02634 STR_ENC_GET(str), single_byte_optimizable(str));
02635
/* NOTE(review): the search is skipped for a compiled regexp whose
 * source length is 0, leaving pos at the byte offset computed
 * above -- presumably because an empty pattern matches at the start
 * position; confirm */
02636 if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
02637 pos = rb_reg_search(sub, str, pos, 1);
02638 pos = rb_str_sublen(str, pos);
02639 }
02640 if (pos >= 0) return LONG2NUM(pos);
02641 break;
02642
02643 default: {
02644 VALUE tmp;
02645
02646 tmp = rb_check_string_type(sub);
02647 if (NIL_P(tmp)) {
02648 rb_raise(rb_eTypeError, "type mismatch: %s given",
02649 rb_obj_classname(sub));
02650 }
02651 sub = tmp;
02652 }
02653
/* the default case above deliberately falls through with the
 * converted string */
02654 case T_STRING:
02655 pos = rb_str_rindex(str, sub, pos);
02656 if (pos >= 0) return LONG2NUM(pos);
02657 break;
02658 }
02659 return Qnil;
02660 }
02661
02662
02663
02664
02665
02666
02667
02668
02669
02670
02671
02672
02673
02674
02675
02676 static VALUE
02677 rb_str_match(VALUE x, VALUE y)
02678 {
02679 switch (TYPE(y)) {
02680 case T_STRING:
02681 rb_raise(rb_eTypeError, "type mismatch: String given");
02682
02683 case T_REGEXP:
02684 return rb_reg_match(y, x);
02685
02686 default:
02687 return rb_funcall(y, rb_intern("=~"), 1, x);
02688 }
02689 }
02690
02691
02692 static VALUE get_pat(VALUE, int);
02693
02694
02695
02696
02697
02698
02699
02700
02701
02702
02703
02704
02705
02706
02707
02708
02709
02710
02711
02712
02713
02714
02715
02716
02717
02718
02719
02720
02721
02722
02723
02724 static VALUE
02725 rb_str_match_m(int argc, VALUE *argv, VALUE str)
02726 {
02727 VALUE re, result;
02728 if (argc < 1)
02729 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
02730 re = argv[0];
02731 argv[0] = str;
02732 result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
02733 if (!NIL_P(result) && rb_block_given_p()) {
02734 return rb_yield(result);
02735 }
02736 return result;
02737 }
02738
/* Outcome of stepping a single character to its successor/predecessor. */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR = 0,  /* the character cannot be stepped */
    NEIGHBOR_FOUND    = 1,  /* a neighbouring character was stored in place */
    NEIGHBOR_WRAPPED  = 2   /* stepping ran off the end of the range */
};
02744
02745 static enum neighbor_char
02746 enc_succ_char(char *p, long len, rb_encoding *enc)
02747 {
02748 long i;
02749 int l;
02750 while (1) {
02751 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
02752 p[i] = '\0';
02753 if (i < 0)
02754 return NEIGHBOR_WRAPPED;
02755 ++((unsigned char*)p)[i];
02756 l = rb_enc_precise_mbclen(p, p+len, enc);
02757 if (MBCLEN_CHARFOUND_P(l)) {
02758 l = MBCLEN_CHARFOUND_LEN(l);
02759 if (l == len) {
02760 return NEIGHBOR_FOUND;
02761 }
02762 else {
02763 memset(p+l, 0xff, len-l);
02764 }
02765 }
02766 if (MBCLEN_INVALID_P(l) && i < len-1) {
02767 long len2;
02768 int l2;
02769 for (len2 = len-1; 0 < len2; len2--) {
02770 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02771 if (!MBCLEN_INVALID_P(l2))
02772 break;
02773 }
02774 memset(p+len2+1, 0xff, len-(len2+1));
02775 }
02776 }
02777 }
02778
02779 static enum neighbor_char
02780 enc_pred_char(char *p, long len, rb_encoding *enc)
02781 {
02782 long i;
02783 int l;
02784 while (1) {
02785 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
02786 p[i] = '\xff';
02787 if (i < 0)
02788 return NEIGHBOR_WRAPPED;
02789 --((unsigned char*)p)[i];
02790 l = rb_enc_precise_mbclen(p, p+len, enc);
02791 if (MBCLEN_CHARFOUND_P(l)) {
02792 l = MBCLEN_CHARFOUND_LEN(l);
02793 if (l == len) {
02794 return NEIGHBOR_FOUND;
02795 }
02796 else {
02797 memset(p+l, 0, len-l);
02798 }
02799 }
02800 if (MBCLEN_INVALID_P(l) && i < len-1) {
02801 long len2;
02802 int l2;
02803 for (len2 = len-1; 0 < len2; len2--) {
02804 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02805 if (!MBCLEN_INVALID_P(l2))
02806 break;
02807 }
02808 memset(p+len2+1, 0, len-(len2+1));
02809 }
02810 }
02811 }
02812
02813
02814
02815
02816
02817
02818
02819
02820
02821
02822 static enum neighbor_char
02823 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
02824 {
02825 enum neighbor_char ret;
02826 unsigned int c;
02827 int ctype;
02828 int range;
02829 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
02830
02831 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02832 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
02833 ctype = ONIGENC_CTYPE_DIGIT;
02834 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
02835 ctype = ONIGENC_CTYPE_ALPHA;
02836 else
02837 return NEIGHBOR_NOT_CHAR;
02838
02839 MEMCPY(save, p, char, len);
02840 ret = enc_succ_char(p, len, enc);
02841 if (ret == NEIGHBOR_FOUND) {
02842 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02843 if (rb_enc_isctype(c, ctype, enc))
02844 return NEIGHBOR_FOUND;
02845 }
02846 MEMCPY(p, save, char, len);
02847 range = 1;
02848 while (1) {
02849 MEMCPY(save, p, char, len);
02850 ret = enc_pred_char(p, len, enc);
02851 if (ret == NEIGHBOR_FOUND) {
02852 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02853 if (!rb_enc_isctype(c, ctype, enc)) {
02854 MEMCPY(p, save, char, len);
02855 break;
02856 }
02857 }
02858 else {
02859 MEMCPY(p, save, char, len);
02860 break;
02861 }
02862 range++;
02863 }
02864 if (range == 1) {
02865 return NEIGHBOR_NOT_CHAR;
02866 }
02867
02868 if (ctype != ONIGENC_CTYPE_DIGIT) {
02869 MEMCPY(carry, p, char, len);
02870 return NEIGHBOR_WRAPPED;
02871 }
02872
02873 MEMCPY(carry, p, char, len);
02874 enc_succ_char(carry, len, enc);
02875 return NEIGHBOR_WRAPPED;
02876 }
02877
02878
02879
02880
02881
02882
02883
02884
02885
02886
02887
02888
02889
02890
02891
02892
02893
02894
02895
02896
02897
02898
02899
02900
02901
02902
02903
02904 VALUE
02905 rb_str_succ(VALUE orig)
02906 {
02907 rb_encoding *enc;
02908 VALUE str;
02909 char *sbeg, *s, *e, *last_alnum = 0;
02910 int c = -1;
02911 long l;
02912 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
02913 long carry_pos = 0, carry_len = 1;
02914 enum neighbor_char neighbor = NEIGHBOR_FOUND;
02915
02916 str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
02917 rb_enc_cr_str_copy_for_substr(str, orig);
02918 OBJ_INFECT(str, orig);
02919 if (RSTRING_LEN(str) == 0) return str;
02920
02921 enc = STR_ENC_GET(orig);
02922 sbeg = RSTRING_PTR(str);
02923 s = e = sbeg + RSTRING_LEN(str);
02924
02925 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
02926 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
02927 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
02928 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
02929 s = last_alnum;
02930 break;
02931 }
02932 }
02933 if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
02934 neighbor = enc_succ_alnum_char(s, l, enc, carry);
02935 switch (neighbor) {
02936 case NEIGHBOR_NOT_CHAR:
02937 continue;
02938 case NEIGHBOR_FOUND:
02939 return str;
02940 case NEIGHBOR_WRAPPED:
02941 last_alnum = s;
02942 break;
02943 }
02944 c = 1;
02945 carry_pos = s - sbeg;
02946 carry_len = l;
02947 }
02948 if (c == -1) {
02949 s = e;
02950 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
02951 enum neighbor_char neighbor;
02952 if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
02953 neighbor = enc_succ_char(s, l, enc);
02954 if (neighbor == NEIGHBOR_FOUND)
02955 return str;
02956 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
02957
02958 enc_succ_char(s, l, enc);
02959 }
02960 if (!rb_enc_asciicompat(enc)) {
02961 MEMCPY(carry, s, char, l);
02962 carry_len = l;
02963 }
02964 carry_pos = s - sbeg;
02965 }
02966 }
02967 RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
02968 s = RSTRING_PTR(str) + carry_pos;
02969 memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
02970 memmove(s, carry, carry_len);
02971 STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
02972 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
02973 rb_enc_str_coderange(str);
02974 return str;
02975 }
02976
02977
02978
02979
02980
02981
02982
02983
02984
02985
02986
02987 static VALUE
02988 rb_str_succ_bang(VALUE str)
02989 {
02990 rb_str_shared_replace(str, rb_str_succ(str));
02991
02992 return str;
02993 }
02994
02995
02996
02997
02998
02999
03000
03001
03002
03003
03004
03005
03006
03007
03008
03009
03010
03011
03012
03013
03014
03015
03016
03017
03018
03019
03020
03021
03022
03023
03024
03025
03026
03027
03028 static VALUE
03029 rb_str_upto(int argc, VALUE *argv, VALUE beg)
03030 {
03031 VALUE end, exclusive;
03032 VALUE current, after_end;
03033 ID succ;
03034 int n, excl, ascii;
03035 rb_encoding *enc;
03036
03037 rb_scan_args(argc, argv, "11", &end, &exclusive);
03038 RETURN_ENUMERATOR(beg, argc, argv);
03039 excl = RTEST(exclusive);
03040 CONST_ID(succ, "succ");
03041 StringValue(end);
03042 enc = rb_enc_check(beg, end);
03043 ascii = (is_ascii_string(beg) && is_ascii_string(end));
03044
03045 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
03046 char c = RSTRING_PTR(beg)[0];
03047 char e = RSTRING_PTR(end)[0];
03048
03049 if (c > e || (excl && c == e)) return beg;
03050 for (;;) {
03051 rb_yield(rb_enc_str_new(&c, 1, enc));
03052 if (!excl && c == e) break;
03053 c++;
03054 if (excl && c == e) break;
03055 }
03056 return beg;
03057 }
03058
03059 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) {
03060 char *s, *send;
03061 VALUE b, e;
03062 int width;
03063
03064 s = RSTRING_PTR(beg); send = RSTRING_END(beg);
03065 width = rb_long2int(send - s);
03066 while (s < send) {
03067 if (!ISDIGIT(*s)) goto no_digits;
03068 s++;
03069 }
03070 s = RSTRING_PTR(end); send = RSTRING_END(end);
03071 while (s < send) {
03072 if (!ISDIGIT(*s)) goto no_digits;
03073 s++;
03074 }
03075 b = rb_str_to_inum(beg, 10, FALSE);
03076 e = rb_str_to_inum(end, 10, FALSE);
03077 if (FIXNUM_P(b) && FIXNUM_P(e)) {
03078 long bi = FIX2LONG(b);
03079 long ei = FIX2LONG(e);
03080 rb_encoding *usascii = rb_usascii_encoding();
03081
03082 while (bi <= ei) {
03083 if (excl && bi == ei) break;
03084 rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi));
03085 bi++;
03086 }
03087 }
03088 else {
03089 ID op = excl ? '<' : rb_intern("<=");
03090 VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d"));
03091
03092 args[0] = INT2FIX(width);
03093 while (rb_funcall(b, op, 1, e)) {
03094 args[1] = b;
03095 rb_yield(rb_str_format(numberof(args), args, fmt));
03096 b = rb_funcall(b, succ, 0, 0);
03097 }
03098 }
03099 return beg;
03100 }
03101
03102 no_digits:
03103 n = rb_str_cmp(beg, end);
03104 if (n > 0 || (excl && n == 0)) return beg;
03105
03106 after_end = rb_funcall(end, succ, 0, 0);
03107 current = rb_str_dup(beg);
03108 while (!rb_str_equal(current, after_end)) {
03109 VALUE next = Qnil;
03110 if (excl || !rb_str_equal(current, end))
03111 next = rb_funcall(current, succ, 0, 0);
03112 rb_yield(current);
03113 if (NIL_P(next)) break;
03114 current = next;
03115 StringValue(current);
03116 if (excl && rb_str_equal(current, end)) break;
03117 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
03118 break;
03119 }
03120
03121 return beg;
03122 }
03123
03124 static VALUE
03125 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
03126 {
03127 if (rb_reg_search(re, str, 0, 0) >= 0) {
03128 VALUE match = rb_backref_get();
03129 int nth = rb_reg_backref_number(match, backref);
03130 return rb_reg_nth_match(nth, match);
03131 }
03132 return Qnil;
03133 }
03134
03135 static VALUE
03136 rb_str_aref(VALUE str, VALUE indx)
03137 {
03138 long idx;
03139
03140 switch (TYPE(indx)) {
03141 case T_FIXNUM:
03142 idx = FIX2LONG(indx);
03143
03144 num_index:
03145 str = rb_str_substr(str, idx, 1);
03146 if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
03147 return str;
03148
03149 case T_REGEXP:
03150 return rb_str_subpat(str, indx, INT2FIX(0));
03151
03152 case T_STRING:
03153 if (rb_str_index(str, indx, 0) != -1)
03154 return rb_str_dup(indx);
03155 return Qnil;
03156
03157 default:
03158
03159 {
03160 long beg, len;
03161 VALUE tmp;
03162
03163 len = str_strlen(str, STR_ENC_GET(str));
03164 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
03165 case Qfalse:
03166 break;
03167 case Qnil:
03168 return Qnil;
03169 default:
03170 tmp = rb_str_substr(str, beg, len);
03171 return tmp;
03172 }
03173 }
03174 idx = NUM2LONG(indx);
03175 goto num_index;
03176 }
03177 return Qnil;
03178 }
03179
03180
03181
03182
03183
03184
03185
03186
03187
03188
03189
03190
03191
03192
03193
03194
03195
03196
03197
03198
03199
03200
03201
03202
03203
03204
03205
03206
03207
03208
03209
03210
03211
03212
03213
03214
03215
03216
03217
03218
03219
03220
03221
03222
03223
03224
03225
03226
03227
03228
03229
03230 static VALUE
03231 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
03232 {
03233 if (argc == 2) {
03234 if (TYPE(argv[0]) == T_REGEXP) {
03235 return rb_str_subpat(str, argv[0], argv[1]);
03236 }
03237 return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
03238 }
03239 if (argc != 1) {
03240 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03241 }
03242 return rb_str_aref(str, argv[0]);
03243 }
03244
03245 VALUE
03246 rb_str_drop_bytes(VALUE str, long len)
03247 {
03248 char *ptr = RSTRING_PTR(str);
03249 long olen = RSTRING_LEN(str), nlen;
03250
03251 str_modifiable(str);
03252 if (len > olen) len = olen;
03253 nlen = olen - len;
03254 if (nlen <= RSTRING_EMBED_LEN_MAX) {
03255 char *oldptr = ptr;
03256 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED));
03257 STR_SET_EMBED(str);
03258 STR_SET_EMBED_LEN(str, nlen);
03259 ptr = RSTRING(str)->as.ary;
03260 memmove(ptr, oldptr + len, nlen);
03261 if (fl == STR_NOEMBED) xfree(oldptr);
03262 }
03263 else {
03264 if (!STR_SHARED_P(str)) rb_str_new4(str);
03265 ptr = RSTRING(str)->as.heap.ptr += len;
03266 RSTRING(str)->as.heap.len = nlen;
03267 }
03268 ptr[nlen] = 0;
03269 ENC_CODERANGE_CLEAR(str);
03270 return str;
03271 }
03272
03273 static void
03274 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
03275 {
03276 if (beg == 0 && RSTRING_LEN(val) == 0) {
03277 rb_str_drop_bytes(str, len);
03278 OBJ_INFECT(str, val);
03279 return;
03280 }
03281
03282 rb_str_modify(str);
03283 if (len < RSTRING_LEN(val)) {
03284
03285 RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
03286 }
03287
03288 if (RSTRING_LEN(val) != len) {
03289 memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
03290 RSTRING_PTR(str) + beg + len,
03291 RSTRING_LEN(str) - (beg + len));
03292 }
03293 if (RSTRING_LEN(val) < beg && len < 0) {
03294 MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
03295 }
03296 if (RSTRING_LEN(val) > 0) {
03297 memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
03298 }
03299 STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
03300 if (RSTRING_PTR(str)) {
03301 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
03302 }
03303 OBJ_INFECT(str, val);
03304 }
03305
03306 static void
03307 rb_str_splice(VALUE str, long beg, long len, VALUE val)
03308 {
03309 long slen;
03310 char *p, *e;
03311 rb_encoding *enc;
03312 int singlebyte = single_byte_optimizable(str);
03313 int cr;
03314
03315 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
03316
03317 StringValue(val);
03318 enc = rb_enc_check(str, val);
03319 slen = str_strlen(str, enc);
03320
03321 if (slen < beg) {
03322 out_of_range:
03323 rb_raise(rb_eIndexError, "index %ld out of string", beg);
03324 }
03325 if (beg < 0) {
03326 if (-beg > slen) {
03327 goto out_of_range;
03328 }
03329 beg += slen;
03330 }
03331 if (slen < len || slen < beg + len) {
03332 len = slen - beg;
03333 }
03334 str_modify_keep_cr(str);
03335 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
03336 if (!p) p = RSTRING_END(str);
03337 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
03338 if (!e) e = RSTRING_END(str);
03339
03340 beg = p - RSTRING_PTR(str);
03341 len = e - p;
03342 rb_str_splice_0(str, beg, len, val);
03343 rb_enc_associate(str, enc);
03344 cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
03345 if (cr != ENC_CODERANGE_BROKEN)
03346 ENC_CODERANGE_SET(str, cr);
03347 }
03348
03349 void
03350 rb_str_update(VALUE str, long beg, long len, VALUE val)
03351 {
03352 rb_str_splice(str, beg, len, val);
03353 }
03354
03355 static void
03356 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
03357 {
03358 int nth;
03359 VALUE match;
03360 long start, end, len;
03361 rb_encoding *enc;
03362 struct re_registers *regs;
03363
03364 if (rb_reg_search(re, str, 0, 0) < 0) {
03365 rb_raise(rb_eIndexError, "regexp not matched");
03366 }
03367 match = rb_backref_get();
03368 nth = rb_reg_backref_number(match, backref);
03369 regs = RMATCH_REGS(match);
03370 if (nth >= regs->num_regs) {
03371 out_of_range:
03372 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
03373 }
03374 if (nth < 0) {
03375 if (-nth >= regs->num_regs) {
03376 goto out_of_range;
03377 }
03378 nth += regs->num_regs;
03379 }
03380
03381 start = BEG(nth);
03382 if (start == -1) {
03383 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
03384 }
03385 end = END(nth);
03386 len = end - start;
03387 StringValue(val);
03388 enc = rb_enc_check(str, val);
03389 rb_str_splice_0(str, start, len, val);
03390 rb_enc_associate(str, enc);
03391 }
03392
03393 static VALUE
03394 rb_str_aset(VALUE str, VALUE indx, VALUE val)
03395 {
03396 long idx, beg;
03397
03398 switch (TYPE(indx)) {
03399 case T_FIXNUM:
03400 idx = FIX2LONG(indx);
03401 num_index:
03402 rb_str_splice(str, idx, 1, val);
03403 return val;
03404
03405 case T_REGEXP:
03406 rb_str_subpat_set(str, indx, INT2FIX(0), val);
03407 return val;
03408
03409 case T_STRING:
03410 beg = rb_str_index(str, indx, 0);
03411 if (beg < 0) {
03412 rb_raise(rb_eIndexError, "string not matched");
03413 }
03414 beg = rb_str_sublen(str, beg);
03415 rb_str_splice(str, beg, str_strlen(indx, 0), val);
03416 return val;
03417
03418 default:
03419
03420 {
03421 long beg, len;
03422 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
03423 rb_str_splice(str, beg, len, val);
03424 return val;
03425 }
03426 }
03427 idx = NUM2LONG(indx);
03428 goto num_index;
03429 }
03430 }
03431
03432
03433
03434
03435
03436
03437
03438
03439
03440
03441
03442
03443
03444
03445
03446
03447
03448
03449
03450
03451
03452
03453
03454
03455
03456
03457 static VALUE
03458 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
03459 {
03460 if (argc == 3) {
03461 if (TYPE(argv[0]) == T_REGEXP) {
03462 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
03463 }
03464 else {
03465 rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
03466 }
03467 return argv[2];
03468 }
03469 if (argc != 2) {
03470 rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..3)", argc);
03471 }
03472 return rb_str_aset(str, argv[0], argv[1]);
03473 }
03474
03475
03476
03477
03478
03479
03480
03481
03482
03483
03484
03485
03486
03487
03488
03489
03490
03491
03492 static VALUE
03493 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
03494 {
03495 long pos = NUM2LONG(idx);
03496
03497 if (pos == -1) {
03498 return rb_str_append(str, str2);
03499 }
03500 else if (pos < 0) {
03501 pos++;
03502 }
03503 rb_str_splice(str, pos, 0, str2);
03504 return str;
03505 }
03506
03507
03508
03509
03510
03511
03512
03513
03514
03515
03516
03517
03518
03519
03520
03521
03522
03523
03524
03525
03526
03527 static VALUE
03528 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
03529 {
03530 VALUE result;
03531 VALUE buf[3];
03532 int i;
03533
03534 if (argc < 1 || 2 < argc) {
03535 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03536 }
03537 for (i=0; i<argc; i++) {
03538 buf[i] = argv[i];
03539 }
03540 str_modify_keep_cr(str);
03541 result = rb_str_aref_m(argc, buf, str);
03542 if (!NIL_P(result)) {
03543 buf[i] = rb_str_new(0,0);
03544 rb_str_aset_m(argc+1, buf, str);
03545 }
03546 return result;
03547 }
03548
03549 static VALUE
03550 get_pat(VALUE pat, int quote)
03551 {
03552 VALUE val;
03553
03554 switch (TYPE(pat)) {
03555 case T_REGEXP:
03556 return pat;
03557
03558 case T_STRING:
03559 break;
03560
03561 default:
03562 val = rb_check_string_type(pat);
03563 if (NIL_P(val)) {
03564 Check_Type(pat, T_REGEXP);
03565 }
03566 pat = val;
03567 }
03568
03569 if (quote) {
03570 pat = rb_reg_quote(pat);
03571 }
03572
03573 return rb_reg_regcomp(pat);
03574 }
03575
03576
03577
03578
03579
03580
03581
03582
03583
03584
03585
03586
03587 static VALUE
03588 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
03589 {
03590 VALUE pat, repl, hash = Qnil;
03591 int iter = 0;
03592 int tainted = 0;
03593 int untrusted = 0;
03594 long plen;
03595
03596 if (argc == 1 && rb_block_given_p()) {
03597 iter = 1;
03598 }
03599 else if (argc == 2) {
03600 repl = argv[1];
03601 hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
03602 if (NIL_P(hash)) {
03603 StringValue(repl);
03604 }
03605 if (OBJ_TAINTED(repl)) tainted = 1;
03606 if (OBJ_UNTRUSTED(repl)) untrusted = 1;
03607 }
03608 else {
03609 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03610 }
03611
03612 pat = get_pat(argv[0], 1);
03613 str_modifiable(str);
03614 if (rb_reg_search(pat, str, 0, 0) >= 0) {
03615 rb_encoding *enc;
03616 int cr = ENC_CODERANGE(str);
03617 VALUE match = rb_backref_get();
03618 struct re_registers *regs = RMATCH_REGS(match);
03619 long beg0 = BEG(0);
03620 long end0 = END(0);
03621 char *p, *rp;
03622 long len, rlen;
03623
03624 if (iter || !NIL_P(hash)) {
03625 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03626
03627 if (iter) {
03628 repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03629 }
03630 else {
03631 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
03632 repl = rb_obj_as_string(repl);
03633 }
03634 str_mod_check(str, p, len);
03635 rb_check_frozen(str);
03636 }
03637 else {
03638 repl = rb_reg_regsub(repl, str, regs, pat);
03639 }
03640 enc = rb_enc_compatible(str, repl);
03641 if (!enc) {
03642 rb_encoding *str_enc = STR_ENC_GET(str);
03643 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03644 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
03645 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
03646 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
03647 rb_enc_name(str_enc),
03648 rb_enc_name(STR_ENC_GET(repl)));
03649 }
03650 enc = STR_ENC_GET(repl);
03651 }
03652 rb_str_modify(str);
03653 rb_enc_associate(str, enc);
03654 if (OBJ_TAINTED(repl)) tainted = 1;
03655 if (OBJ_UNTRUSTED(repl)) untrusted = 1;
03656 if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
03657 int cr2 = ENC_CODERANGE(repl);
03658 if (cr2 == ENC_CODERANGE_BROKEN ||
03659 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
03660 cr = ENC_CODERANGE_UNKNOWN;
03661 else
03662 cr = cr2;
03663 }
03664 plen = end0 - beg0;
03665 rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
03666 len = RSTRING_LEN(str);
03667 if (rlen > plen) {
03668 RESIZE_CAPA(str, len + rlen - plen);
03669 }
03670 p = RSTRING_PTR(str);
03671 if (rlen != plen) {
03672 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
03673 }
03674 memcpy(p + beg0, rp, rlen);
03675 len += rlen - plen;
03676 STR_SET_LEN(str, len);
03677 RSTRING_PTR(str)[len] = '\0';
03678 ENC_CODERANGE_SET(str, cr);
03679 if (tainted) OBJ_TAINT(str);
03680 if (untrusted) OBJ_UNTRUST(str);
03681
03682 return str;
03683 }
03684 return Qnil;
03685 }
03686
03687
03688
03689
03690
03691
03692
03693
03694
03695
03696
03697
03698
03699
03700
03701
03702
03703
03704
03705
03706
03707
03708
03709
03710
03711
03712
03713
03714
03715
03716
03717
03718
03719
03720
03721
03722
03723
03724
03725
03726
03727
03728 static VALUE
03729 rb_str_sub(int argc, VALUE *argv, VALUE str)
03730 {
03731 str = rb_str_dup(str);
03732 rb_str_sub_bang(argc, argv, str);
03733 return str;
03734 }
03735
03736 static VALUE
03737 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
03738 {
03739 VALUE pat, val, repl, match, dest, hash = Qnil;
03740 struct re_registers *regs;
03741 long beg, n;
03742 long beg0, end0;
03743 long offset, blen, slen, len, last;
03744 int iter = 0;
03745 char *sp, *cp;
03746 int tainted = 0;
03747 rb_encoding *str_enc;
03748
03749 switch (argc) {
03750 case 1:
03751 RETURN_ENUMERATOR(str, argc, argv);
03752 iter = 1;
03753 break;
03754 case 2:
03755 repl = argv[1];
03756 hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
03757 if (NIL_P(hash)) {
03758 StringValue(repl);
03759 }
03760 if (OBJ_TAINTED(repl)) tainted = 1;
03761 break;
03762 default:
03763 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03764 }
03765
03766 pat = get_pat(argv[0], 1);
03767 beg = rb_reg_search(pat, str, 0, 0);
03768 if (beg < 0) {
03769 if (bang) return Qnil;
03770 return rb_str_dup(str);
03771 }
03772
03773 offset = 0;
03774 n = 0;
03775 blen = RSTRING_LEN(str) + 30;
03776 dest = rb_str_buf_new(blen);
03777 sp = RSTRING_PTR(str);
03778 slen = RSTRING_LEN(str);
03779 cp = sp;
03780 str_enc = STR_ENC_GET(str);
03781 rb_enc_associate(dest, str_enc);
03782 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
03783
03784 do {
03785 n++;
03786 match = rb_backref_get();
03787 regs = RMATCH_REGS(match);
03788 beg0 = BEG(0);
03789 end0 = END(0);
03790 if (iter || !NIL_P(hash)) {
03791 if (iter) {
03792 val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03793 }
03794 else {
03795 val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
03796 val = rb_obj_as_string(val);
03797 }
03798 str_mod_check(str, sp, slen);
03799 if (val == dest) {
03800 rb_raise(rb_eRuntimeError, "block should not cheat");
03801 }
03802 }
03803 else {
03804 val = rb_reg_regsub(repl, str, regs, pat);
03805 }
03806
03807 if (OBJ_TAINTED(val)) tainted = 1;
03808
03809 len = beg - offset;
03810 if (len) {
03811 rb_enc_str_buf_cat(dest, cp, len, str_enc);
03812 }
03813
03814 rb_str_buf_append(dest, val);
03815
03816 last = offset;
03817 offset = end0;
03818 if (beg0 == end0) {
03819
03820
03821
03822
03823 if (RSTRING_LEN(str) <= end0) break;
03824 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
03825 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
03826 offset = end0 + len;
03827 }
03828 cp = RSTRING_PTR(str) + offset;
03829 if (offset > RSTRING_LEN(str)) break;
03830 beg = rb_reg_search(pat, str, offset, 0);
03831 } while (beg >= 0);
03832 if (RSTRING_LEN(str) > offset) {
03833 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
03834 }
03835 rb_reg_search(pat, str, last, 0);
03836 if (bang) {
03837 rb_str_shared_replace(str, dest);
03838 }
03839 else {
03840 RBASIC(dest)->klass = rb_obj_class(str);
03841 OBJ_INFECT(dest, str);
03842 str = dest;
03843 }
03844
03845 if (tainted) OBJ_TAINT(str);
03846 return str;
03847 }
03848
03849
03850
03851
03852
03853
03854
03855
03856
03857
03858
03859
03860
03861 static VALUE
03862 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
03863 {
03864 str_modify_keep_cr(str);
03865 return str_gsub(argc, argv, str, 1);
03866 }
03867
03868
03869
03870
03871
03872
03873
03874
03875
03876
03877
03878
03879
03880
03881
03882
03883
03884
03885
03886
03887
03888
03889
03890
03891
03892
03893
03894
03895
03896
03897
03898
03899
03900
03901
03902
03903
03904
03905
03906
03907
03908
03909
03910
03911
03912 static VALUE
03913 rb_str_gsub(int argc, VALUE *argv, VALUE str)
03914 {
03915 return str_gsub(argc, argv, str, 0);
03916 }
03917
03918
03919
03920
03921
03922
03923
03924
03925
03926
03927
03928
03929
03930 VALUE
03931 rb_str_replace(VALUE str, VALUE str2)
03932 {
03933 str_modifiable(str);
03934 if (str == str2) return str;
03935
03936 StringValue(str2);
03937 str_discard(str);
03938 return str_replace(str, str2);
03939 }
03940
03941
03942
03943
03944
03945
03946
03947
03948
03949
03950
03951 static VALUE
03952 rb_str_clear(VALUE str)
03953 {
03954 str_discard(str);
03955 STR_SET_EMBED(str);
03956 STR_SET_EMBED_LEN(str, 0);
03957 RSTRING_PTR(str)[0] = 0;
03958 if (rb_enc_asciicompat(STR_ENC_GET(str)))
03959 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
03960 else
03961 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
03962 return str;
03963 }
03964
03965
03966
03967
03968
03969
03970
03971
03972
03973
03974
03975 static VALUE
03976 rb_str_chr(VALUE str)
03977 {
03978 return rb_str_substr(str, 0, 1);
03979 }
03980
03981
03982
03983
03984
03985
03986
03987 static VALUE
03988 rb_str_getbyte(VALUE str, VALUE index)
03989 {
03990 long pos = NUM2LONG(index);
03991
03992 if (pos < 0)
03993 pos += RSTRING_LEN(str);
03994 if (pos < 0 || RSTRING_LEN(str) <= pos)
03995 return Qnil;
03996
03997 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
03998 }
03999
04000
04001
04002
04003
04004
04005
04006 static VALUE
04007 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
04008 {
04009 long pos = NUM2LONG(index);
04010 int byte = NUM2INT(value);
04011
04012 rb_str_modify(str);
04013
04014 if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
04015 rb_raise(rb_eIndexError, "index %ld out of string", pos);
04016 if (pos < 0)
04017 pos += RSTRING_LEN(str);
04018
04019 RSTRING_PTR(str)[pos] = byte;
04020
04021 return value;
04022 }
04023
04024 static VALUE
04025 str_byte_substr(VALUE str, long beg, long len)
04026 {
04027 char *p, *s = RSTRING_PTR(str);
04028 long n = RSTRING_LEN(str);
04029 VALUE str2;
04030
04031 if (beg > n || len < 0) return Qnil;
04032 if (beg < 0) {
04033 beg += n;
04034 if (beg < 0) return Qnil;
04035 }
04036 if (beg + len > n)
04037 len = n - beg;
04038 if (len <= 0) {
04039 len = 0;
04040 p = 0;
04041 }
04042 else
04043 p = s + beg;
04044
04045 if (len > RSTRING_EMBED_LEN_MAX && beg + len == n) {
04046 str2 = rb_str_new4(str);
04047 str2 = str_new3(rb_obj_class(str2), str2);
04048 RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
04049 RSTRING(str2)->as.heap.len = len;
04050 }
04051 else {
04052 str2 = rb_str_new5(str, p, len);
04053 rb_enc_cr_str_copy_for_substr(str2, str);
04054 OBJ_INFECT(str2, str);
04055 }
04056
04057 return str2;
04058 }
04059
04060 static VALUE
04061 str_byte_aref(VALUE str, VALUE indx)
04062 {
04063 long idx;
04064 switch (TYPE(indx)) {
04065 case T_FIXNUM:
04066 idx = FIX2LONG(indx);
04067
04068 num_index:
04069 str = str_byte_substr(str, idx, 1);
04070 if (NIL_P(str) || RSTRING_LEN(str) == 0) return Qnil;
04071 return str;
04072
04073 default:
04074
04075 {
04076 long beg, len = RSTRING_LEN(str);
04077
04078 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
04079 case Qfalse:
04080 break;
04081 case Qnil:
04082 return Qnil;
04083 default:
04084 return str_byte_substr(str, beg, len);
04085 }
04086 }
04087 idx = NUM2LONG(indx);
04088 goto num_index;
04089 }
04090 return Qnil;
04091 }
04092
04093
04094
04095
04096
04097
04098
04099
04100
04101
04102
04103
04104
04105
04106
04107
04108
04109
04110
04111
04112
04113
04114
04115
04116 static VALUE
04117 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
04118 {
04119 if (argc == 2) {
04120 return str_byte_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
04121 }
04122 if (argc != 1) {
04123 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
04124 }
04125 return str_byte_aref(str, argv[0]);
04126 }
04127
04128
04129
04130
04131
04132
04133
04134
04135
04136
04137 static VALUE
04138 rb_str_reverse(VALUE str)
04139 {
04140 rb_encoding *enc;
04141 VALUE rev;
04142 char *s, *e, *p;
04143 int single = 1;
04144
04145 if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
04146 enc = STR_ENC_GET(str);
04147 rev = rb_str_new5(str, 0, RSTRING_LEN(str));
04148 s = RSTRING_PTR(str); e = RSTRING_END(str);
04149 p = RSTRING_END(rev);
04150
04151 if (RSTRING_LEN(str) > 1) {
04152 if (single_byte_optimizable(str)) {
04153 while (s < e) {
04154 *--p = *s++;
04155 }
04156 }
04157 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) {
04158 while (s < e) {
04159 int clen = rb_enc_fast_mbclen(s, e, enc);
04160
04161 if (clen > 1 || (*s & 0x80)) single = 0;
04162 p -= clen;
04163 memcpy(p, s, clen);
04164 s += clen;
04165 }
04166 }
04167 else {
04168 while (s < e) {
04169 int clen = rb_enc_mbclen(s, e, enc);
04170
04171 if (clen > 1 || (*s & 0x80)) single = 0;
04172 p -= clen;
04173 memcpy(p, s, clen);
04174 s += clen;
04175 }
04176 }
04177 }
04178 STR_SET_LEN(rev, RSTRING_LEN(str));
04179 OBJ_INFECT(rev, str);
04180 if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
04181 if (single) {
04182 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
04183 }
04184 else {
04185 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
04186 }
04187 }
04188 rb_enc_cr_str_copy_for_substr(rev, str);
04189
04190 return rev;
04191 }
04192
04193
04194
04195
04196
04197
04198
04199
04200
04201 static VALUE
04202 rb_str_reverse_bang(VALUE str)
04203 {
04204 if (RSTRING_LEN(str) > 1) {
04205 if (single_byte_optimizable(str)) {
04206 char *s, *e, c;
04207
04208 str_modify_keep_cr(str);
04209 s = RSTRING_PTR(str);
04210 e = RSTRING_END(str) - 1;
04211 while (s < e) {
04212 c = *s;
04213 *s++ = *e;
04214 *e-- = c;
04215 }
04216 }
04217 else {
04218 rb_str_shared_replace(str, rb_str_reverse(str));
04219 }
04220 }
04221 else {
04222 str_modify_keep_cr(str);
04223 }
04224 return str;
04225 }
04226
04227
04228
04229
04230
04231
04232
04233
04234
04235
04236
04237
04238
04239
04240 static VALUE
04241 rb_str_include(VALUE str, VALUE arg)
04242 {
04243 long i;
04244
04245 StringValue(arg);
04246 i = rb_str_index(str, arg, 0);
04247
04248 if (i == -1) return Qfalse;
04249 return Qtrue;
04250 }
04251
04252
04253
04254
04255
04256
04257
04258
04259
04260
04261
04262
04263
04264
04265
04266
04267
04268
04269
04270
04271
04272
04273
04274 static VALUE
04275 rb_str_to_i(int argc, VALUE *argv, VALUE str)
04276 {
04277 int base;
04278
04279 if (argc == 0) base = 10;
04280 else {
04281 VALUE b;
04282
04283 rb_scan_args(argc, argv, "01", &b);
04284 base = NUM2INT(b);
04285 }
04286 if (base < 0) {
04287 rb_raise(rb_eArgError, "invalid radix %d", base);
04288 }
04289 return rb_str_to_inum(str, base, FALSE);
04290 }
04291
04292
04293
04294
04295
04296
04297
04298
04299
04300
04301
04302
04303
04304
04305
04306
04307 static VALUE
04308 rb_str_to_f(VALUE str)
04309 {
04310 return DBL2NUM(rb_str_to_dbl(str, FALSE));
04311 }
04312
04313
04314
04315
04316
04317
04318
04319
04320
04321
04322 static VALUE
04323 rb_str_to_s(VALUE str)
04324 {
04325 if (rb_obj_class(str) != rb_cString) {
04326 return str_duplicate(rb_cString, str);
04327 }
04328 return str;
04329 }
04330
#if 0
/*
 * Compiled out.  Encodes code point +c+ in +enc+ into a temporary buffer
 * and appends the encoded bytes to +str+.
 */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    int len = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, len, enc);
}
#endif
04342
04343 #define CHAR_ESC_LEN 13
04344
04345 int
04346 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
04347 {
04348 char buf[CHAR_ESC_LEN + 1];
04349 int l;
04350
04351 #if SIZEOF_INT > 4
04352 c &= 0xffffffff;
04353 #endif
04354 if (unicode_p) {
04355 if (c < 0x7F && ISPRINT(c)) {
04356 snprintf(buf, CHAR_ESC_LEN, "%c", c);
04357 }
04358 else if (c < 0x10000) {
04359 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
04360 }
04361 else {
04362 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
04363 }
04364 }
04365 else {
04366 if (c < 0x100) {
04367 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
04368 }
04369 else {
04370 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
04371 }
04372 }
04373 l = (int)strlen(buf);
04374 rb_str_buf_cat(result, buf, l);
04375 return l;
04376 }
04377
04378
04379
04380
04381
04382
04383
04384
04385
04386
04387
04388
04389
04390 VALUE
04391 rb_str_inspect(VALUE str)
04392 {
04393 rb_encoding *enc = STR_ENC_GET(str);
04394 const char *p, *pend, *prev;
04395 char buf[CHAR_ESC_LEN + 1];
04396 VALUE result = rb_str_buf_new(0);
04397 rb_encoding *resenc = rb_default_internal_encoding();
04398 int unicode_p = rb_enc_unicode_p(enc);
04399 int asciicompat = rb_enc_asciicompat(enc);
04400 static rb_encoding *utf16, *utf32;
04401
04402 if (!utf16) utf16 = rb_enc_find("UTF-16");
04403 if (!utf32) utf32 = rb_enc_find("UTF-32");
04404 if (resenc == NULL) resenc = rb_default_external_encoding();
04405 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
04406 rb_enc_associate(result, resenc);
04407 str_buf_cat2(result, "\"");
04408
04409 p = RSTRING_PTR(str); pend = RSTRING_END(str);
04410 prev = p;
04411 if (enc == utf16) {
04412 const unsigned char *q = (const unsigned char *)p;
04413 if (q[0] == 0xFE && q[1] == 0xFF)
04414 enc = rb_enc_find("UTF-16BE");
04415 else if (q[0] == 0xFF && q[1] == 0xFE)
04416 enc = rb_enc_find("UTF-16LE");
04417 else
04418 unicode_p = 0;
04419 }
04420 else if (enc == utf32) {
04421 const unsigned char *q = (const unsigned char *)p;
04422 if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF)
04423 enc = rb_enc_find("UTF-32BE");
04424 else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF)
04425 enc = rb_enc_find("UTF-32LE");
04426 else
04427 unicode_p = 0;
04428 }
04429 while (p < pend) {
04430 unsigned int c, cc;
04431 int n;
04432
04433 n = rb_enc_precise_mbclen(p, pend, enc);
04434 if (!MBCLEN_CHARFOUND_P(n)) {
04435 if (p > prev) str_buf_cat(result, prev, p - prev);
04436 n = rb_enc_mbminlen(enc);
04437 if (pend < p + n)
04438 n = (int)(pend - p);
04439 while (n--) {
04440 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
04441 str_buf_cat(result, buf, strlen(buf));
04442 prev = ++p;
04443 }
04444 continue;
04445 }
04446 n = MBCLEN_CHARFOUND_LEN(n);
04447 c = rb_enc_mbc_to_codepoint(p, pend, enc);
04448 p += n;
04449 if ((asciicompat || unicode_p) &&
04450 (c == '"'|| c == '\\' ||
04451 (c == '#' &&
04452 p < pend &&
04453 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
04454 (cc = rb_enc_codepoint(p,pend,enc),
04455 (cc == '$' || cc == '@' || cc == '{'))))) {
04456 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04457 str_buf_cat2(result, "\\");
04458 if (asciicompat || enc == resenc) {
04459 prev = p - n;
04460 continue;
04461 }
04462 }
04463 switch (c) {
04464 case '\n': cc = 'n'; break;
04465 case '\r': cc = 'r'; break;
04466 case '\t': cc = 't'; break;
04467 case '\f': cc = 'f'; break;
04468 case '\013': cc = 'v'; break;
04469 case '\010': cc = 'b'; break;
04470 case '\007': cc = 'a'; break;
04471 case 033: cc = 'e'; break;
04472 default: cc = 0; break;
04473 }
04474 if (cc) {
04475 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04476 buf[0] = '\\';
04477 buf[1] = (char)cc;
04478 str_buf_cat(result, buf, 2);
04479 prev = p;
04480 continue;
04481 }
04482 if ((enc == resenc && rb_enc_isprint(c, enc)) ||
04483 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
04484 continue;
04485 }
04486 else {
04487 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04488 rb_str_buf_cat_escaped_char(result, c, unicode_p);
04489 prev = p;
04490 continue;
04491 }
04492 }
04493 if (p > prev) str_buf_cat(result, prev, p - prev);
04494 str_buf_cat2(result, "\"");
04495
04496 OBJ_INFECT(result, str);
04497 return result;
04498 }
04499
04500 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
04501
04502
04503
04504
04505
04506
04507
04508
04509
04510 VALUE
04511 rb_str_dump(VALUE str)
04512 {
04513 rb_encoding *enc = rb_enc_get(str);
04514 long len;
04515 const char *p, *pend;
04516 char *q, *qend;
04517 VALUE result;
04518 int u8 = (enc == rb_utf8_encoding());
04519
04520 len = 2;
04521 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04522 while (p < pend) {
04523 unsigned char c = *p++;
04524 switch (c) {
04525 case '"': case '\\':
04526 case '\n': case '\r':
04527 case '\t': case '\f':
04528 case '\013': case '\010': case '\007': case '\033':
04529 len += 2;
04530 break;
04531
04532 case '#':
04533 len += IS_EVSTR(p, pend) ? 2 : 1;
04534 break;
04535
04536 default:
04537 if (ISPRINT(c)) {
04538 len++;
04539 }
04540 else {
04541 if (u8) {
04542 int n = rb_enc_precise_mbclen(p-1, pend, enc);
04543 if (MBCLEN_CHARFOUND_P(n-1)) {
04544 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04545 while (cc >>= 4) len++;
04546 len += 5;
04547 p += MBCLEN_CHARFOUND_LEN(n)-1;
04548 break;
04549 }
04550 }
04551 len += 4;
04552 }
04553 break;
04554 }
04555 }
04556 if (!rb_enc_asciicompat(enc)) {
04557 len += 19;
04558 len += strlen(enc->name);
04559 }
04560
04561 result = rb_str_new5(str, 0, len);
04562 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04563 q = RSTRING_PTR(result); qend = q + len + 1;
04564
04565 *q++ = '"';
04566 while (p < pend) {
04567 unsigned char c = *p++;
04568
04569 if (c == '"' || c == '\\') {
04570 *q++ = '\\';
04571 *q++ = c;
04572 }
04573 else if (c == '#') {
04574 if (IS_EVSTR(p, pend)) *q++ = '\\';
04575 *q++ = '#';
04576 }
04577 else if (c == '\n') {
04578 *q++ = '\\';
04579 *q++ = 'n';
04580 }
04581 else if (c == '\r') {
04582 *q++ = '\\';
04583 *q++ = 'r';
04584 }
04585 else if (c == '\t') {
04586 *q++ = '\\';
04587 *q++ = 't';
04588 }
04589 else if (c == '\f') {
04590 *q++ = '\\';
04591 *q++ = 'f';
04592 }
04593 else if (c == '\013') {
04594 *q++ = '\\';
04595 *q++ = 'v';
04596 }
04597 else if (c == '\010') {
04598 *q++ = '\\';
04599 *q++ = 'b';
04600 }
04601 else if (c == '\007') {
04602 *q++ = '\\';
04603 *q++ = 'a';
04604 }
04605 else if (c == '\033') {
04606 *q++ = '\\';
04607 *q++ = 'e';
04608 }
04609 else if (ISPRINT(c)) {
04610 *q++ = c;
04611 }
04612 else {
04613 *q++ = '\\';
04614 if (u8) {
04615 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
04616 if (MBCLEN_CHARFOUND_P(n)) {
04617 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04618 p += n;
04619 snprintf(q, qend-q, "u{%x}", cc);
04620 q += strlen(q);
04621 continue;
04622 }
04623 }
04624 snprintf(q, qend-q, "x%02X", c);
04625 q += 3;
04626 }
04627 }
04628 *q++ = '"';
04629 *q = '\0';
04630 if (!rb_enc_asciicompat(enc)) {
04631 snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name);
04632 enc = rb_ascii8bit_encoding();
04633 }
04634 OBJ_INFECT(result, str);
04635
04636 rb_enc_associate(result, enc);
04637 ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
04638 return result;
04639 }
04640
04641
04642 static void
04643 rb_str_check_dummy_enc(rb_encoding *enc)
04644 {
04645 if (rb_enc_dummy_p(enc)) {
04646 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
04647 rb_enc_name(enc));
04648 }
04649 }
04650
04651
04652
04653
04654
04655
04656
04657
04658
04659
04660 static VALUE
04661 rb_str_upcase_bang(VALUE str)
04662 {
04663 rb_encoding *enc;
04664 char *s, *send;
04665 int modify = 0;
04666 int n;
04667
04668 str_modify_keep_cr(str);
04669 enc = STR_ENC_GET(str);
04670 rb_str_check_dummy_enc(enc);
04671 s = RSTRING_PTR(str); send = RSTRING_END(str);
04672 if (single_byte_optimizable(str)) {
04673 while (s < send) {
04674 unsigned int c = *(unsigned char*)s;
04675
04676 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04677 *s = 'A' + (c - 'a');
04678 modify = 1;
04679 }
04680 s++;
04681 }
04682 }
04683 else {
04684 int ascompat = rb_enc_asciicompat(enc);
04685
04686 while (s < send) {
04687 unsigned int c;
04688
04689 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04690 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04691 *s = 'A' + (c - 'a');
04692 modify = 1;
04693 }
04694 s++;
04695 }
04696 else {
04697 c = rb_enc_codepoint_len(s, send, &n, enc);
04698 if (rb_enc_islower(c, enc)) {
04699
04700 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04701 modify = 1;
04702 }
04703 s += n;
04704 }
04705 }
04706 }
04707
04708 if (modify) return str;
04709 return Qnil;
04710 }
04711
04712
04713
04714
04715
04716
04717
04718
04719
04720
04721
04722
04723
04724
04725 static VALUE
04726 rb_str_upcase(VALUE str)
04727 {
04728 str = rb_str_dup(str);
04729 rb_str_upcase_bang(str);
04730 return str;
04731 }
04732
04733
04734
04735
04736
04737
04738
04739
04740
04741
04742
04743 static VALUE
04744 rb_str_downcase_bang(VALUE str)
04745 {
04746 rb_encoding *enc;
04747 char *s, *send;
04748 int modify = 0;
04749
04750 str_modify_keep_cr(str);
04751 enc = STR_ENC_GET(str);
04752 rb_str_check_dummy_enc(enc);
04753 s = RSTRING_PTR(str); send = RSTRING_END(str);
04754 if (single_byte_optimizable(str)) {
04755 while (s < send) {
04756 unsigned int c = *(unsigned char*)s;
04757
04758 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04759 *s = 'a' + (c - 'A');
04760 modify = 1;
04761 }
04762 s++;
04763 }
04764 }
04765 else {
04766 int ascompat = rb_enc_asciicompat(enc);
04767
04768 while (s < send) {
04769 unsigned int c;
04770 int n;
04771
04772 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04773 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04774 *s = 'a' + (c - 'A');
04775 modify = 1;
04776 }
04777 s++;
04778 }
04779 else {
04780 c = rb_enc_codepoint_len(s, send, &n, enc);
04781 if (rb_enc_isupper(c, enc)) {
04782
04783 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04784 modify = 1;
04785 }
04786 s += n;
04787 }
04788 }
04789 }
04790
04791 if (modify) return str;
04792 return Qnil;
04793 }
04794
04795
04796
04797
04798
04799
04800
04801
04802
04803
04804
04805
04806
04807
04808 static VALUE
04809 rb_str_downcase(VALUE str)
04810 {
04811 str = rb_str_dup(str);
04812 rb_str_downcase_bang(str);
04813 return str;
04814 }
04815
04816
04817
04818
04819
04820
04821
04822
04823
04824
04825
04826
04827
04828
04829
04830
04831 static VALUE
04832 rb_str_capitalize_bang(VALUE str)
04833 {
04834 rb_encoding *enc;
04835 char *s, *send;
04836 int modify = 0;
04837 unsigned int c;
04838 int n;
04839
04840 str_modify_keep_cr(str);
04841 enc = STR_ENC_GET(str);
04842 rb_str_check_dummy_enc(enc);
04843 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
04844 s = RSTRING_PTR(str); send = RSTRING_END(str);
04845
04846 c = rb_enc_codepoint_len(s, send, &n, enc);
04847 if (rb_enc_islower(c, enc)) {
04848 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04849 modify = 1;
04850 }
04851 s += n;
04852 while (s < send) {
04853 c = rb_enc_codepoint_len(s, send, &n, enc);
04854 if (rb_enc_isupper(c, enc)) {
04855 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04856 modify = 1;
04857 }
04858 s += n;
04859 }
04860
04861 if (modify) return str;
04862 return Qnil;
04863 }
04864
04865
04866
04867
04868
04869
04870
04871
04872
04873
04874
04875
04876
04877
04878
04879 static VALUE
04880 rb_str_capitalize(VALUE str)
04881 {
04882 str = rb_str_dup(str);
04883 rb_str_capitalize_bang(str);
04884 return str;
04885 }
04886
04887
04888
04889
04890
04891
04892
04893
04894
04895
04896
04897 static VALUE
04898 rb_str_swapcase_bang(VALUE str)
04899 {
04900 rb_encoding *enc;
04901 char *s, *send;
04902 int modify = 0;
04903 int n;
04904
04905 str_modify_keep_cr(str);
04906 enc = STR_ENC_GET(str);
04907 rb_str_check_dummy_enc(enc);
04908 s = RSTRING_PTR(str); send = RSTRING_END(str);
04909 while (s < send) {
04910 unsigned int c = rb_enc_codepoint_len(s, send, &n, enc);
04911
04912 if (rb_enc_isupper(c, enc)) {
04913
04914 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04915 modify = 1;
04916 }
04917 else if (rb_enc_islower(c, enc)) {
04918
04919 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04920 modify = 1;
04921 }
04922 s += n;
04923 }
04924
04925 if (modify) return str;
04926 return Qnil;
04927 }
04928
04929
04930
04931
04932
04933
04934
04935
04936
04937
04938
04939
04940
04941
04942 static VALUE
04943 rb_str_swapcase(VALUE str)
04944 {
04945 str = rb_str_dup(str);
04946 rb_str_swapcase_bang(str);
04947 return str;
04948 }
04949
typedef unsigned char *USTR;

/* Cursor over a tr-style character specification (see trnext). */
struct tr {
    int gen;            /* non-zero while a range is being expanded */
    unsigned int now;   /* code point produced most recently */
    unsigned int max;   /* last code point of the range being expanded */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* one past the last byte of the specification */
};
04957
04958 static unsigned int
04959 trnext(struct tr *t, rb_encoding *enc)
04960 {
04961 int n;
04962
04963 for (;;) {
04964 if (!t->gen) {
04965 if (t->p == t->pend) return -1;
04966 if (t->p < t->pend - 1 && *t->p == '\\') {
04967 t->p++;
04968 }
04969 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
04970 t->p += n;
04971 if (t->p < t->pend - 1 && *t->p == '-') {
04972 t->p++;
04973 if (t->p < t->pend) {
04974 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
04975 t->p += n;
04976 if (t->now > c) {
04977 if (t->now < 0x80 && c < 0x80) {
04978 rb_raise(rb_eArgError,
04979 "invalid range \"%c-%c\" in string transliteration",
04980 t->now, c);
04981 }
04982 else {
04983 rb_raise(rb_eArgError, "invalid range in string transliteration");
04984 }
04985 continue;
04986 }
04987 t->gen = 1;
04988 t->max = c;
04989 }
04990 }
04991 return t->now;
04992 }
04993 else if (++t->now < t->max) {
04994 return t->now;
04995 }
04996 else {
04997 t->gen = 0;
04998 return t->max;
04999 }
05000 }
05001 }
05002
05003 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
05004
05005 static VALUE
05006 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
05007 {
05008 const unsigned int errc = -1;
05009 unsigned int trans[256];
05010 rb_encoding *enc, *e1, *e2;
05011 struct tr trsrc, trrepl;
05012 int cflag = 0;
05013 unsigned int c, c0, last = 0;
05014 int modify = 0, i, l;
05015 char *s, *send;
05016 VALUE hash = 0;
05017 int singlebyte = single_byte_optimizable(str);
05018 int cr;
05019
05020 #define CHECK_IF_ASCII(c) \
05021 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
05022 (cr = ENC_CODERANGE_VALID) : 0)
05023
05024 StringValue(src);
05025 StringValue(repl);
05026 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
05027 if (RSTRING_LEN(repl) == 0) {
05028 return rb_str_delete_bang(1, &src, str);
05029 }
05030
05031 cr = ENC_CODERANGE(str);
05032 e1 = rb_enc_check(str, src);
05033 e2 = rb_enc_check(str, repl);
05034 if (e1 == e2) {
05035 enc = e1;
05036 }
05037 else {
05038 enc = rb_enc_check(src, repl);
05039 }
05040 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
05041 if (RSTRING_LEN(src) > 1 &&
05042 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
05043 trsrc.p + l < trsrc.pend) {
05044 cflag = 1;
05045 trsrc.p += l;
05046 }
05047 trrepl.p = RSTRING_PTR(repl);
05048 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
05049 trsrc.gen = trrepl.gen = 0;
05050 trsrc.now = trrepl.now = 0;
05051 trsrc.max = trrepl.max = 0;
05052
05053 if (cflag) {
05054 for (i=0; i<256; i++) {
05055 trans[i] = 1;
05056 }
05057 while ((c = trnext(&trsrc, enc)) != errc) {
05058 if (c < 256) {
05059 trans[c] = errc;
05060 }
05061 else {
05062 if (!hash) hash = rb_hash_new();
05063 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
05064 }
05065 }
05066 while ((c = trnext(&trrepl, enc)) != errc)
05067 ;
05068 last = trrepl.now;
05069 for (i=0; i<256; i++) {
05070 if (trans[i] != errc) {
05071 trans[i] = last;
05072 }
05073 }
05074 }
05075 else {
05076 unsigned int r;
05077
05078 for (i=0; i<256; i++) {
05079 trans[i] = errc;
05080 }
05081 while ((c = trnext(&trsrc, enc)) != errc) {
05082 r = trnext(&trrepl, enc);
05083 if (r == errc) r = trrepl.now;
05084 if (c < 256) {
05085 trans[c] = r;
05086 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
05087 }
05088 else {
05089 if (!hash) hash = rb_hash_new();
05090 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
05091 }
05092 }
05093 }
05094
05095 if (cr == ENC_CODERANGE_VALID)
05096 cr = ENC_CODERANGE_7BIT;
05097 str_modify_keep_cr(str);
05098 s = RSTRING_PTR(str); send = RSTRING_END(str);
05099 if (sflag) {
05100 int clen, tlen;
05101 long offset, max = RSTRING_LEN(str);
05102 unsigned int save = -1;
05103 char *buf = ALLOC_N(char, max), *t = buf;
05104
05105 while (s < send) {
05106 int may_modify = 0;
05107
05108 c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
05109 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
05110
05111 s += clen;
05112 if (c < 256) {
05113 c = trans[c];
05114 }
05115 else if (hash) {
05116 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
05117 if (NIL_P(tmp)) {
05118 if (cflag) c = last;
05119 else c = errc;
05120 }
05121 else if (cflag) c = errc;
05122 else c = NUM2INT(tmp);
05123 }
05124 else {
05125 c = errc;
05126 }
05127 if (c != (unsigned int)-1) {
05128 if (save == c) {
05129 CHECK_IF_ASCII(c);
05130 continue;
05131 }
05132 save = c;
05133 tlen = rb_enc_codelen(c, enc);
05134 modify = 1;
05135 }
05136 else {
05137 save = -1;
05138 c = c0;
05139 if (enc != e1) may_modify = 1;
05140 }
05141 while (t - buf + tlen >= max) {
05142 offset = t - buf;
05143 max *= 2;
05144 REALLOC_N(buf, char, max);
05145 t = buf + offset;
05146 }
05147 rb_enc_mbcput(c, t, enc);
05148 if (may_modify && memcmp(s, t, tlen) != 0) {
05149 modify = 1;
05150 }
05151 CHECK_IF_ASCII(c);
05152 t += tlen;
05153 }
05154 if (!STR_EMBED_P(str)) {
05155 xfree(RSTRING(str)->as.heap.ptr);
05156 }
05157 *t = '\0';
05158 RSTRING(str)->as.heap.ptr = buf;
05159 RSTRING(str)->as.heap.len = t - buf;
05160 STR_SET_NOEMBED(str);
05161 RSTRING(str)->as.heap.aux.capa = max;
05162 }
05163 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
05164 while (s < send) {
05165 c = (unsigned char)*s;
05166 if (trans[c] != errc) {
05167 if (!cflag) {
05168 c = trans[c];
05169 *s = c;
05170 modify = 1;
05171 }
05172 else {
05173 *s = last;
05174 modify = 1;
05175 }
05176 }
05177 CHECK_IF_ASCII(c);
05178 s++;
05179 }
05180 }
05181 else {
05182 int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2);
05183 long offset;
05184 char *buf = ALLOC_N(char, max), *t = buf;
05185
05186 while (s < send) {
05187 int may_modify = 0;
05188 c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
05189 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
05190
05191 if (c < 256) {
05192 c = trans[c];
05193 }
05194 else if (hash) {
05195 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
05196 if (NIL_P(tmp)) {
05197 if (cflag) c = last;
05198 else c = errc;
05199 }
05200 else if (cflag) c = errc;
05201 else c = NUM2INT(tmp);
05202 }
05203 else {
05204 c = cflag ? last : errc;
05205 }
05206 if (c != errc) {
05207 tlen = rb_enc_codelen(c, enc);
05208 modify = 1;
05209 }
05210 else {
05211 c = c0;
05212 if (enc != e1) may_modify = 1;
05213 }
05214 while (t - buf + tlen >= max) {
05215 offset = t - buf;
05216 max *= 2;
05217 REALLOC_N(buf, char, max);
05218 t = buf + offset;
05219 }
05220 if (s != t) {
05221 rb_enc_mbcput(c, t, enc);
05222 if (may_modify && memcmp(s, t, tlen) != 0) {
05223 modify = 1;
05224 }
05225 }
05226 CHECK_IF_ASCII(c);
05227 s += clen;
05228 t += tlen;
05229 }
05230 if (!STR_EMBED_P(str)) {
05231 xfree(RSTRING(str)->as.heap.ptr);
05232 }
05233 *t = '\0';
05234 RSTRING(str)->as.heap.ptr = buf;
05235 RSTRING(str)->as.heap.len = t - buf;
05236 STR_SET_NOEMBED(str);
05237 RSTRING(str)->as.heap.aux.capa = max;
05238 }
05239
05240 if (modify) {
05241 if (cr != ENC_CODERANGE_BROKEN)
05242 ENC_CODERANGE_SET(str, cr);
05243 rb_enc_associate(str, enc);
05244 return str;
05245 }
05246 return Qnil;
05247 }
05248
05249
05250
05251
05252
05253
05254
05255
05256
05257
05258
05259 static VALUE
05260 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
05261 {
05262 return tr_trans(str, src, repl, 0);
05263 }
05264
05265
05266
05267
05268
05269
05270
05271
05272
05273
05274
05275
05276
05277
05278
05279
05280
05281
05282
05283
05284
05285
05286 static VALUE
05287 rb_str_tr(VALUE str, VALUE src, VALUE repl)
05288 {
05289 str = rb_str_dup(str);
05290 tr_trans(str, src, repl, 0);
05291 return str;
05292 }
05293
05294 #define TR_TABLE_SIZE 257
05295 static void
05296 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
05297 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
05298 {
05299 const unsigned int errc = -1;
05300 char buf[256];
05301 struct tr tr;
05302 unsigned int c;
05303 VALUE table = 0, ptable = 0;
05304 int i, l, cflag = 0;
05305
05306 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
05307 tr.gen = tr.now = tr.max = 0;
05308
05309 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
05310 cflag = 1;
05311 tr.p += l;
05312 }
05313 if (first) {
05314 for (i=0; i<256; i++) {
05315 stable[i] = 1;
05316 }
05317 stable[256] = cflag;
05318 }
05319 else if (stable[256] && !cflag) {
05320 stable[256] = 0;
05321 }
05322 for (i=0; i<256; i++) {
05323 buf[i] = cflag;
05324 }
05325
05326 while ((c = trnext(&tr, enc)) != errc) {
05327 if (c < 256) {
05328 buf[c & 0xff] = !cflag;
05329 }
05330 else {
05331 VALUE key = UINT2NUM(c);
05332
05333 if (!table) {
05334 table = rb_hash_new();
05335 if (cflag) {
05336 ptable = *ctablep;
05337 *ctablep = table;
05338 }
05339 else {
05340 ptable = *tablep;
05341 *tablep = table;
05342 }
05343 }
05344 if (!ptable || !NIL_P(rb_hash_aref(ptable, key))) {
05345 rb_hash_aset(table, key, Qtrue);
05346 }
05347 }
05348 }
05349 for (i=0; i<256; i++) {
05350 stable[i] = stable[i] && buf[i];
05351 }
05352 }
05353
05354
05355 static int
05356 tr_find(unsigned int c, char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
05357 {
05358 if (c < 256) {
05359 return table[c] != 0;
05360 }
05361 else {
05362 VALUE v = UINT2NUM(c);
05363
05364 if (del) {
05365 if (!NIL_P(rb_hash_lookup(del, v)) &&
05366 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
05367 return TRUE;
05368 }
05369 }
05370 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
05371 return FALSE;
05372 }
05373 return table[256] ? TRUE : FALSE;
05374 }
05375 }
05376
05377
05378
05379
05380
05381
05382
05383
05384
05385 static VALUE
05386 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
05387 {
05388 char squeez[TR_TABLE_SIZE];
05389 rb_encoding *enc = 0;
05390 char *s, *send, *t;
05391 VALUE del = 0, nodel = 0;
05392 int modify = 0;
05393 int i, ascompat, cr;
05394
05395 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
05396 if (argc < 1) {
05397 rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
05398 }
05399 for (i=0; i<argc; i++) {
05400 VALUE s = argv[i];
05401
05402 StringValue(s);
05403 enc = rb_enc_check(str, s);
05404 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05405 }
05406
05407 str_modify_keep_cr(str);
05408 ascompat = rb_enc_asciicompat(enc);
05409 s = t = RSTRING_PTR(str);
05410 send = RSTRING_END(str);
05411 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
05412 while (s < send) {
05413 unsigned int c;
05414 int clen;
05415
05416 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05417 if (squeez[c]) {
05418 modify = 1;
05419 }
05420 else {
05421 if (t != s) *t = c;
05422 t++;
05423 }
05424 s++;
05425 }
05426 else {
05427 c = rb_enc_codepoint_len(s, send, &clen, enc);
05428
05429 if (tr_find(c, squeez, del, nodel)) {
05430 modify = 1;
05431 }
05432 else {
05433 if (t != s) rb_enc_mbcput(c, t, enc);
05434 t += clen;
05435 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
05436 }
05437 s += clen;
05438 }
05439 }
05440 *t = '\0';
05441 STR_SET_LEN(str, t - RSTRING_PTR(str));
05442 ENC_CODERANGE_SET(str, cr);
05443
05444 if (modify) return str;
05445 return Qnil;
05446 }
05447
05448
05449
05450
05451
05452
05453
05454
05455
05456
05457
05458
05459
05460
05461
05462
05463 static VALUE
05464 rb_str_delete(int argc, VALUE *argv, VALUE str)
05465 {
05466 str = rb_str_dup(str);
05467 rb_str_delete_bang(argc, argv, str);
05468 return str;
05469 }
05470
05471
05472
05473
05474
05475
05476
05477
05478
05479
05480 static VALUE
05481 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
05482 {
05483 char squeez[TR_TABLE_SIZE];
05484 rb_encoding *enc = 0;
05485 VALUE del = 0, nodel = 0;
05486 char *s, *send, *t;
05487 int i, modify = 0;
05488 int ascompat, singlebyte = single_byte_optimizable(str);
05489 unsigned int save;
05490
05491 if (argc == 0) {
05492 enc = STR_ENC_GET(str);
05493 }
05494 else {
05495 for (i=0; i<argc; i++) {
05496 VALUE s = argv[i];
05497
05498 StringValue(s);
05499 enc = rb_enc_check(str, s);
05500 if (singlebyte && !single_byte_optimizable(s))
05501 singlebyte = 0;
05502 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05503 }
05504 }
05505
05506 str_modify_keep_cr(str);
05507 s = t = RSTRING_PTR(str);
05508 if (!s || RSTRING_LEN(str) == 0) return Qnil;
05509 send = RSTRING_END(str);
05510 save = -1;
05511 ascompat = rb_enc_asciicompat(enc);
05512
05513 if (singlebyte) {
05514 while (s < send) {
05515 unsigned int c = *(unsigned char*)s++;
05516 if (c != save || (argc > 0 && !squeez[c])) {
05517 *t++ = save = c;
05518 }
05519 }
05520 } else {
05521 while (s < send) {
05522 unsigned int c;
05523 int clen;
05524
05525 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05526 if (c != save || (argc > 0 && !squeez[c])) {
05527 *t++ = save = c;
05528 }
05529 s++;
05530 }
05531 else {
05532 c = rb_enc_codepoint_len(s, send, &clen, enc);
05533
05534 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
05535 if (t != s) rb_enc_mbcput(c, t, enc);
05536 save = c;
05537 t += clen;
05538 }
05539 s += clen;
05540 }
05541 }
05542 }
05543
05544 *t = '\0';
05545 if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
05546 STR_SET_LEN(str, t - RSTRING_PTR(str));
05547 modify = 1;
05548 }
05549
05550 if (modify) return str;
05551 return Qnil;
05552 }
05553
05554
05555
05556
05557
05558
05559
05560
05561
05562
05563
05564
05565
05566
05567
05568
05569
05570 static VALUE
05571 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
05572 {
05573 str = rb_str_dup(str);
05574 rb_str_squeeze_bang(argc, argv, str);
05575 return str;
05576 }
05577
05578
05579
05580
05581
05582
05583
05584
05585
05586
05587 static VALUE
05588 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
05589 {
05590 return tr_trans(str, src, repl, 1);
05591 }
05592
05593
05594
05595
05596
05597
05598
05599
05600
05601
05602
05603
05604
05605
05606
05607 static VALUE
05608 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
05609 {
05610 str = rb_str_dup(str);
05611 tr_trans(str, src, repl, 1);
05612 return str;
05613 }
05614
05615
05616
05617
05618
05619
05620
05621
05622
05623
05624
05625
05626
05627
05628
05629
05630
05631
05632 static VALUE
05633 rb_str_count(int argc, VALUE *argv, VALUE str)
05634 {
05635 char table[TR_TABLE_SIZE];
05636 rb_encoding *enc = 0;
05637 VALUE del = 0, nodel = 0;
05638 char *s, *send;
05639 int i;
05640 int ascompat;
05641
05642 if (argc < 1) {
05643 rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
05644 }
05645 for (i=0; i<argc; i++) {
05646 VALUE tstr = argv[i];
05647 unsigned char c;
05648
05649 StringValue(tstr);
05650 enc = rb_enc_check(str, tstr);
05651 if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
05652 (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) {
05653 int n = 0;
05654
05655 s = RSTRING_PTR(str);
05656 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05657 send = RSTRING_END(str);
05658 while (s < send) {
05659 if (*(unsigned char*)s++ == c) n++;
05660 }
05661 return INT2NUM(n);
05662 }
05663 tr_setup_table(tstr, table, i==0, &del, &nodel, enc);
05664 }
05665
05666 s = RSTRING_PTR(str);
05667 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05668 send = RSTRING_END(str);
05669 ascompat = rb_enc_asciicompat(enc);
05670 i = 0;
05671 while (s < send) {
05672 unsigned int c;
05673
05674 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05675 if (table[c]) {
05676 i++;
05677 }
05678 s++;
05679 }
05680 else {
05681 int clen;
05682 c = rb_enc_codepoint_len(s, send, &clen, enc);
05683 if (tr_find(c, table, del, nodel)) {
05684 i++;
05685 }
05686 s += clen;
05687 }
05688 }
05689
05690 return INT2NUM(i);
05691 }
05692
/*
 * Byte-indexed lookup table: an entry is 1 for the bytes 0x09..0x0D and
 * 0x20, and 0 for every other byte value.
 */
static const char isspacetable[256] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xD0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xE0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xF0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

/* Table lookup; the cast to unsigned char keeps the index within 0..255. */
#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
05713
05714
05715
05716
05717
05718
05719
05720
05721
05722
05723
05724
05725
05726
05727
05728
05729
05730
05731
05732
05733
05734
05735
05736
05737
05738
05739
05740
05741
05742
05743
05744
05745
05746
05747
05748
05749
05750
05751
05752
05753
05754
05755
05756 static VALUE
05757 rb_str_split_m(int argc, VALUE *argv, VALUE str)
05758 {
05759 rb_encoding *enc;
05760 VALUE spat;
05761 VALUE limit;
05762 enum {awk, string, regexp} split_type;
05763 long beg, end, i = 0;
05764 int lim = 0;
05765 VALUE result, tmp;
05766
05767 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
05768 lim = NUM2INT(limit);
05769 if (lim <= 0) limit = Qnil;
05770 else if (lim == 1) {
05771 if (RSTRING_LEN(str) == 0)
05772 return rb_ary_new2(0);
05773 return rb_ary_new3(1, str);
05774 }
05775 i = 1;
05776 }
05777
05778 enc = STR_ENC_GET(str);
05779 if (NIL_P(spat)) {
05780 if (!NIL_P(rb_fs)) {
05781 spat = rb_fs;
05782 goto fs_set;
05783 }
05784 split_type = awk;
05785 }
05786 else {
05787 fs_set:
05788 if (TYPE(spat) == T_STRING) {
05789 rb_encoding *enc2 = STR_ENC_GET(spat);
05790
05791 split_type = string;
05792 if (RSTRING_LEN(spat) == 0) {
05793
05794 spat = rb_reg_regcomp(spat);
05795 split_type = regexp;
05796 }
05797 else if (rb_enc_asciicompat(enc2) == 1) {
05798 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
05799 split_type = awk;
05800 }
05801 }
05802 else {
05803 int l;
05804 if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
05805 RSTRING_LEN(spat) == l) {
05806 split_type = awk;
05807 }
05808 }
05809 }
05810 else {
05811 spat = get_pat(spat, 1);
05812 split_type = regexp;
05813 }
05814 }
05815
05816 result = rb_ary_new();
05817 beg = 0;
05818 if (split_type == awk) {
05819 char *ptr = RSTRING_PTR(str);
05820 char *eptr = RSTRING_END(str);
05821 char *bptr = ptr;
05822 int skip = 1;
05823 unsigned int c;
05824
05825 end = beg;
05826 if (is_ascii_string(str)) {
05827 while (ptr < eptr) {
05828 c = (unsigned char)*ptr++;
05829 if (skip) {
05830 if (ascii_isspace(c)) {
05831 beg = ptr - bptr;
05832 }
05833 else {
05834 end = ptr - bptr;
05835 skip = 0;
05836 if (!NIL_P(limit) && lim <= i) break;
05837 }
05838 }
05839 else if (ascii_isspace(c)) {
05840 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05841 skip = 1;
05842 beg = ptr - bptr;
05843 if (!NIL_P(limit)) ++i;
05844 }
05845 else {
05846 end = ptr - bptr;
05847 }
05848 }
05849 }
05850 else {
05851 while (ptr < eptr) {
05852 int n;
05853
05854 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
05855 ptr += n;
05856 if (skip) {
05857 if (rb_isspace(c)) {
05858 beg = ptr - bptr;
05859 }
05860 else {
05861 end = ptr - bptr;
05862 skip = 0;
05863 if (!NIL_P(limit) && lim <= i) break;
05864 }
05865 }
05866 else if (rb_isspace(c)) {
05867 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05868 skip = 1;
05869 beg = ptr - bptr;
05870 if (!NIL_P(limit)) ++i;
05871 }
05872 else {
05873 end = ptr - bptr;
05874 }
05875 }
05876 }
05877 }
05878 else if (split_type == string) {
05879 char *ptr = RSTRING_PTR(str);
05880 char *temp = ptr;
05881 char *eptr = RSTRING_END(str);
05882 char *sptr = RSTRING_PTR(spat);
05883 long slen = RSTRING_LEN(spat);
05884
05885 if (is_broken_string(str)) {
05886 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
05887 }
05888 if (is_broken_string(spat)) {
05889 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
05890 }
05891 enc = rb_enc_check(str, spat);
05892 while (ptr < eptr &&
05893 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
05894
05895 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
05896 if (t != ptr + end) {
05897 ptr = t;
05898 continue;
05899 }
05900 rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
05901 ptr += end + slen;
05902 if (!NIL_P(limit) && lim <= ++i) break;
05903 }
05904 beg = ptr - temp;
05905 }
05906 else {
05907 char *ptr = RSTRING_PTR(str);
05908 long len = RSTRING_LEN(str);
05909 long start = beg;
05910 long idx;
05911 int last_null = 0;
05912 struct re_registers *regs;
05913
05914 while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
05915 regs = RMATCH_REGS(rb_backref_get());
05916 if (start == end && BEG(0) == END(0)) {
05917 if (!ptr) {
05918 rb_ary_push(result, str_new_empty(str));
05919 break;
05920 }
05921 else if (last_null == 1) {
05922 rb_ary_push(result, rb_str_subseq(str, beg,
05923 rb_enc_fast_mbclen(ptr+beg,
05924 ptr+len,
05925 enc)));
05926 beg = start;
05927 }
05928 else {
05929 if (ptr+start == ptr+len)
05930 start++;
05931 else
05932 start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
05933 last_null = 1;
05934 continue;
05935 }
05936 }
05937 else {
05938 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05939 beg = start = END(0);
05940 }
05941 last_null = 0;
05942
05943 for (idx=1; idx < regs->num_regs; idx++) {
05944 if (BEG(idx) == -1) continue;
05945 if (BEG(idx) == END(idx))
05946 tmp = str_new_empty(str);
05947 else
05948 tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
05949 rb_ary_push(result, tmp);
05950 }
05951 if (!NIL_P(limit) && lim <= ++i) break;
05952 }
05953 }
05954 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
05955 if (RSTRING_LEN(str) == beg)
05956 tmp = str_new_empty(str);
05957 else
05958 tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
05959 rb_ary_push(result, tmp);
05960 }
05961 if (NIL_P(limit) && lim == 0) {
05962 long len;
05963 while ((len = RARRAY_LEN(result)) > 0 &&
05964 (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
05965 rb_ary_pop(result);
05966 }
05967
05968 return result;
05969 }
05970
05971 VALUE
05972 rb_str_split(VALUE str, const char *sep0)
05973 {
05974 VALUE sep;
05975
05976 StringValue(str);
05977 sep = rb_str_new2(sep0);
05978 return rb_str_split_m(1, &sep, str);
05979 }
05980
05981
05982
05983
05984
05985
05986
05987
05988
05989
05990
05991
05992
05993
05994
05995
05996
05997
05998
05999
06000
06001
06002
06003
06004
06005
06006
06007
06008
06009
06010
06011
06012
06013
06014
06015
06016
06017
06018
06019 static VALUE
06020 rb_str_each_line(int argc, VALUE *argv, VALUE str)
06021 {
06022 rb_encoding *enc;
06023 VALUE rs;
06024 unsigned int newline;
06025 const char *p, *pend, *s, *ptr;
06026 long len, rslen;
06027 VALUE line;
06028 int n;
06029 VALUE orig = str;
06030
06031 if (argc == 0) {
06032 rs = rb_rs;
06033 }
06034 else {
06035 rb_scan_args(argc, argv, "01", &rs);
06036 }
06037 RETURN_ENUMERATOR(str, argc, argv);
06038 if (NIL_P(rs)) {
06039 rb_yield(str);
06040 return orig;
06041 }
06042 str = rb_str_new4(str);
06043 ptr = p = s = RSTRING_PTR(str);
06044 pend = p + RSTRING_LEN(str);
06045 len = RSTRING_LEN(str);
06046 StringValue(rs);
06047 if (rs == rb_default_rs) {
06048 enc = rb_enc_get(str);
06049 while (p < pend) {
06050 char *p0;
06051
06052 p = memchr(p, '\n', pend - p);
06053 if (!p) break;
06054 p0 = rb_enc_left_char_head(s, p, pend, enc);
06055 if (!rb_enc_is_newline(p0, pend, enc)) {
06056 p++;
06057 continue;
06058 }
06059 p = p0 + rb_enc_mbclen(p0, pend, enc);
06060 line = rb_str_new5(str, s, p - s);
06061 OBJ_INFECT(line, str);
06062 rb_enc_cr_str_copy_for_substr(line, str);
06063 rb_yield(line);
06064 str_mod_check(str, ptr, len);
06065 s = p;
06066 }
06067 goto finish;
06068 }
06069
06070 enc = rb_enc_check(str, rs);
06071 rslen = RSTRING_LEN(rs);
06072 if (rslen == 0) {
06073 newline = '\n';
06074 }
06075 else {
06076 newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
06077 }
06078
06079 while (p < pend) {
06080 unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);
06081
06082 again:
06083 if (rslen == 0 && c == newline) {
06084 p += n;
06085 if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
06086 goto again;
06087 }
06088 while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
06089 p += n;
06090 }
06091 p -= n;
06092 }
06093 if (c == newline &&
06094 (rslen <= 1 ||
06095 (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) {
06096 line = rb_str_new5(str, s, p - s + (rslen ? rslen : n));
06097 OBJ_INFECT(line, str);
06098 rb_enc_cr_str_copy_for_substr(line, str);
06099 rb_yield(line);
06100 str_mod_check(str, ptr, len);
06101 s = p + (rslen ? rslen : n);
06102 }
06103 p += n;
06104 }
06105
06106 finish:
06107 if (s != pend) {
06108 line = rb_str_new5(str, s, pend - s);
06109 OBJ_INFECT(line, str);
06110 rb_enc_cr_str_copy_for_substr(line, str);
06111 rb_yield(line);
06112 }
06113
06114 return orig;
06115 }
06116
06117
06118
06119
06120
06121
06122
06123
06124
06125
06126
06127
06128
06129
06130
06131
06132
06133
06134
06135
06136 static VALUE
06137 rb_str_each_byte(VALUE str)
06138 {
06139 long i;
06140
06141 RETURN_ENUMERATOR(str, 0, 0);
06142 for (i=0; i<RSTRING_LEN(str); i++) {
06143 rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
06144 }
06145 return str;
06146 }
06147
06148
06149
06150
06151
06152
06153
06154
06155
06156
06157
06158
06159
06160
06161
06162
06163
06164
06165
06166
06167 static VALUE
06168 rb_str_each_char(VALUE str)
06169 {
06170 VALUE orig = str;
06171 long i, len, n;
06172 const char *ptr;
06173 rb_encoding *enc;
06174
06175 RETURN_ENUMERATOR(str, 0, 0);
06176 str = rb_str_new4(str);
06177 ptr = RSTRING_PTR(str);
06178 len = RSTRING_LEN(str);
06179 enc = rb_enc_get(str);
06180 switch (ENC_CODERANGE(str)) {
06181 case ENC_CODERANGE_VALID:
06182 case ENC_CODERANGE_7BIT:
06183 for (i = 0; i < len; i += n) {
06184 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
06185 rb_yield(rb_str_subseq(str, i, n));
06186 }
06187 break;
06188 default:
06189 for (i = 0; i < len; i += n) {
06190 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
06191 rb_yield(rb_str_subseq(str, i, n));
06192 }
06193 }
06194 return orig;
06195 }
06196
06197
06198
06199
06200
06201
06202
06203
06204
06205
06206
06207
06208
06209
06210
06211
06212
06213
06214
06215
06216
06217
06218 static VALUE
06219 rb_str_each_codepoint(VALUE str)
06220 {
06221 VALUE orig = str;
06222 int n;
06223 unsigned int c;
06224 const char *ptr, *end;
06225 rb_encoding *enc;
06226
06227 if (single_byte_optimizable(str)) return rb_str_each_byte(str);
06228 RETURN_ENUMERATOR(str, 0, 0);
06229 str = rb_str_new4(str);
06230 ptr = RSTRING_PTR(str);
06231 end = RSTRING_END(str);
06232 enc = STR_ENC_GET(str);
06233 while (ptr < end) {
06234 c = rb_enc_codepoint_len(ptr, end, &n, enc);
06235 rb_yield(UINT2NUM(c));
06236 ptr += n;
06237 }
06238 return orig;
06239 }
06240
06241 static long
06242 chopped_length(VALUE str)
06243 {
06244 rb_encoding *enc = STR_ENC_GET(str);
06245 const char *p, *p2, *beg, *end;
06246
06247 beg = RSTRING_PTR(str);
06248 end = beg + RSTRING_LEN(str);
06249 if (beg > end) return 0;
06250 p = rb_enc_prev_char(beg, end, end, enc);
06251 if (!p) return 0;
06252 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
06253 p2 = rb_enc_prev_char(beg, p, end, enc);
06254 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
06255 }
06256 return p - beg;
06257 }
06258
06259
06260
06261
06262
06263
06264
06265
06266
06267
06268 static VALUE
06269 rb_str_chop_bang(VALUE str)
06270 {
06271 str_modify_keep_cr(str);
06272 if (RSTRING_LEN(str) > 0) {
06273 long len;
06274 len = chopped_length(str);
06275 STR_SET_LEN(str, len);
06276 RSTRING_PTR(str)[len] = '\0';
06277 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06278 ENC_CODERANGE_CLEAR(str);
06279 }
06280 return str;
06281 }
06282 return Qnil;
06283 }
06284
06285
06286
06287
06288
06289
06290
06291
06292
06293
06294
06295
06296
06297
06298
06299
06300
06301
06302
06303 static VALUE
06304 rb_str_chop(VALUE str)
06305 {
06306 VALUE str2 = rb_str_new5(str, RSTRING_PTR(str), chopped_length(str));
06307 rb_enc_cr_str_copy_for_substr(str2, str);
06308 OBJ_INFECT(str2, str);
06309 return str2;
06310 }
06311
06312
06313
06314
06315
06316
06317
06318
06319
06320
06321 static VALUE
06322 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
06323 {
06324 rb_encoding *enc;
06325 VALUE rs;
06326 int newline;
06327 char *p, *pp, *e;
06328 long len, rslen;
06329
06330 str_modify_keep_cr(str);
06331 len = RSTRING_LEN(str);
06332 if (len == 0) return Qnil;
06333 p = RSTRING_PTR(str);
06334 e = p + len;
06335 if (argc == 0) {
06336 rs = rb_rs;
06337 if (rs == rb_default_rs) {
06338 smart_chomp:
06339 enc = rb_enc_get(str);
06340 if (rb_enc_mbminlen(enc) > 1) {
06341 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
06342 if (rb_enc_is_newline(pp, e, enc)) {
06343 e = pp;
06344 }
06345 pp = e - rb_enc_mbminlen(enc);
06346 if (pp >= p) {
06347 pp = rb_enc_left_char_head(p, pp, e, enc);
06348 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
06349 e = pp;
06350 }
06351 }
06352 if (e == RSTRING_END(str)) {
06353 return Qnil;
06354 }
06355 len = e - RSTRING_PTR(str);
06356 STR_SET_LEN(str, len);
06357 }
06358 else {
06359 if (RSTRING_PTR(str)[len-1] == '\n') {
06360 STR_DEC_LEN(str);
06361 if (RSTRING_LEN(str) > 0 &&
06362 RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
06363 STR_DEC_LEN(str);
06364 }
06365 }
06366 else if (RSTRING_PTR(str)[len-1] == '\r') {
06367 STR_DEC_LEN(str);
06368 }
06369 else {
06370 return Qnil;
06371 }
06372 }
06373 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06374 return str;
06375 }
06376 }
06377 else {
06378 rb_scan_args(argc, argv, "01", &rs);
06379 }
06380 if (NIL_P(rs)) return Qnil;
06381 StringValue(rs);
06382 rslen = RSTRING_LEN(rs);
06383 if (rslen == 0) {
06384 while (len>0 && p[len-1] == '\n') {
06385 len--;
06386 if (len>0 && p[len-1] == '\r')
06387 len--;
06388 }
06389 if (len < RSTRING_LEN(str)) {
06390 STR_SET_LEN(str, len);
06391 RSTRING_PTR(str)[len] = '\0';
06392 return str;
06393 }
06394 return Qnil;
06395 }
06396 if (rslen > len) return Qnil;
06397 newline = RSTRING_PTR(rs)[rslen-1];
06398 if (rslen == 1 && newline == '\n')
06399 goto smart_chomp;
06400
06401 enc = rb_enc_check(str, rs);
06402 if (is_broken_string(rs)) {
06403 return Qnil;
06404 }
06405 pp = e - rslen;
06406 if (p[len-1] == newline &&
06407 (rslen <= 1 ||
06408 memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
06409 if (rb_enc_left_char_head(p, pp, e, enc) != pp)
06410 return Qnil;
06411 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06412 ENC_CODERANGE_CLEAR(str);
06413 }
06414 STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
06415 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06416 return str;
06417 }
06418 return Qnil;
06419 }
06420
06421
06422
06423
06424
06425
06426
06427
06428
06429
06430
06431
06432
06433
06434
06435
06436
06437
06438
06439
06440
06441 static VALUE
06442 rb_str_chomp(int argc, VALUE *argv, VALUE str)
06443 {
06444 str = rb_str_dup(str);
06445 rb_str_chomp_bang(argc, argv, str);
06446 return str;
06447 }
06448
06449
06450
06451
06452
06453
06454
06455
06456
06457
06458
06459
06460
06461 static VALUE
06462 rb_str_lstrip_bang(VALUE str)
06463 {
06464 rb_encoding *enc;
06465 char *s, *t, *e;
06466
06467 str_modify_keep_cr(str);
06468 enc = STR_ENC_GET(str);
06469 s = RSTRING_PTR(str);
06470 if (!s || RSTRING_LEN(str) == 0) return Qnil;
06471 e = t = RSTRING_END(str);
06472
06473 while (s < e) {
06474 int n;
06475 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
06476
06477 if (!rb_isspace(cc)) break;
06478 s += n;
06479 }
06480
06481 if (s > RSTRING_PTR(str)) {
06482 STR_SET_LEN(str, t-s);
06483 memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
06484 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06485 return str;
06486 }
06487 return Qnil;
06488 }
06489
06490
06491
06492
06493
06494
06495
06496
06497
06498
06499
06500
06501
06502 static VALUE
06503 rb_str_lstrip(VALUE str)
06504 {
06505 str = rb_str_dup(str);
06506 rb_str_lstrip_bang(str);
06507 return str;
06508 }
06509
06510
06511
06512
06513
06514
06515
06516
06517
06518
06519
06520
06521
06522
06523 static VALUE
06524 rb_str_rstrip_bang(VALUE str)
06525 {
06526 rb_encoding *enc;
06527 char *s, *t, *e;
06528
06529 str_modify_keep_cr(str);
06530 enc = STR_ENC_GET(str);
06531 rb_str_check_dummy_enc(enc);
06532 s = RSTRING_PTR(str);
06533 if (!s || RSTRING_LEN(str) == 0) return Qnil;
06534 t = e = RSTRING_END(str);
06535
06536
06537 if (single_byte_optimizable(str)) {
06538 unsigned char c;
06539 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
06540 }
06541 else {
06542 char *tp;
06543
06544 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
06545 unsigned int c = rb_enc_codepoint(tp, e, enc);
06546 if (c && !rb_isspace(c)) break;
06547 t = tp;
06548 }
06549 }
06550 if (t < e) {
06551 long len = t-RSTRING_PTR(str);
06552
06553 STR_SET_LEN(str, len);
06554 RSTRING_PTR(str)[len] = '\0';
06555 return str;
06556 }
06557 return Qnil;
06558 }
06559
06560
06561
06562
06563
06564
06565
06566
06567
06568
06569
06570
06571
06572 static VALUE
06573 rb_str_rstrip(VALUE str)
06574 {
06575 str = rb_str_dup(str);
06576 rb_str_rstrip_bang(str);
06577 return str;
06578 }
06579
06580
06581
06582
06583
06584
06585
06586
06587
06588
06589 static VALUE
06590 rb_str_strip_bang(VALUE str)
06591 {
06592 VALUE l = rb_str_lstrip_bang(str);
06593 VALUE r = rb_str_rstrip_bang(str);
06594
06595 if (NIL_P(l) && NIL_P(r)) return Qnil;
06596 return str;
06597 }
06598
06599
06600
06601
06602
06603
06604
06605
06606
06607
06608
06609
06610 static VALUE
06611 rb_str_strip(VALUE str)
06612 {
06613 str = rb_str_dup(str);
06614 rb_str_strip_bang(str);
06615 return str;
06616 }
06617
06618 static VALUE
06619 scan_once(VALUE str, VALUE pat, long *start)
06620 {
06621 VALUE result, match;
06622 struct re_registers *regs;
06623 int i;
06624
06625 if (rb_reg_search(pat, str, *start, 0) >= 0) {
06626 match = rb_backref_get();
06627 regs = RMATCH_REGS(match);
06628 if (BEG(0) == END(0)) {
06629 rb_encoding *enc = STR_ENC_GET(str);
06630
06631
06632
06633 if (RSTRING_LEN(str) > END(0))
06634 *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0),
06635 RSTRING_END(str), enc);
06636 else
06637 *start = END(0)+1;
06638 }
06639 else {
06640 *start = END(0);
06641 }
06642 if (regs->num_regs == 1) {
06643 return rb_reg_nth_match(0, match);
06644 }
06645 result = rb_ary_new2(regs->num_regs);
06646 for (i=1; i < regs->num_regs; i++) {
06647 rb_ary_push(result, rb_reg_nth_match(i, match));
06648 }
06649
06650 return result;
06651 }
06652 return Qnil;
06653 }
06654
06655
06656
06657
06658
06659
06660
06661
06662
06663
06664
06665
06666
06667
06668
06669
06670
06671
06672
06673
06674
06675
06676
06677
06678
06679
06680
06681
06682
06683
06684
06685
06686
06687 static VALUE
06688 rb_str_scan(VALUE str, VALUE pat)
06689 {
06690 VALUE result;
06691 long start = 0;
06692 long last = -1, prev = 0;
06693 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
06694
06695 pat = get_pat(pat, 1);
06696 if (!rb_block_given_p()) {
06697 VALUE ary = rb_ary_new();
06698
06699 while (!NIL_P(result = scan_once(str, pat, &start))) {
06700 last = prev;
06701 prev = start;
06702 rb_ary_push(ary, result);
06703 }
06704 if (last >= 0) rb_reg_search(pat, str, last, 0);
06705 return ary;
06706 }
06707
06708 while (!NIL_P(result = scan_once(str, pat, &start))) {
06709 last = prev;
06710 prev = start;
06711 rb_yield(result);
06712 str_mod_check(str, p, len);
06713 }
06714 if (last >= 0) rb_reg_search(pat, str, last, 0);
06715 return str;
06716 }
06717
06718
06719
06720
06721
06722
06723
06724
06725
06726
06727
06728
06729
06730
06731
06732
06733 static VALUE
06734 rb_str_hex(VALUE str)
06735 {
06736 rb_encoding *enc = rb_enc_get(str);
06737
06738 if (!rb_enc_asciicompat(enc)) {
06739 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
06740 }
06741 return rb_str_to_inum(str, 16, FALSE);
06742 }
06743
06744
06745
06746
06747
06748
06749
06750
06751
06752
06753
06754
06755
06756
06757
06758
06759 static VALUE
06760 rb_str_oct(VALUE str)
06761 {
06762 rb_encoding *enc = rb_enc_get(str);
06763
06764 if (!rb_enc_asciicompat(enc)) {
06765 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
06766 }
06767 return rb_str_to_inum(str, -8, FALSE);
06768 }
06769
06770
06771
06772
06773
06774
06775
06776
06777
06778
06779
06780
06781 static VALUE
06782 rb_str_crypt(VALUE str, VALUE salt)
06783 {
06784 extern char *crypt(const char *, const char *);
06785 VALUE result;
06786 const char *s, *saltp;
06787 #ifdef BROKEN_CRYPT
06788 char salt_8bit_clean[3];
06789 #endif
06790
06791 StringValue(salt);
06792 if (RSTRING_LEN(salt) < 2)
06793 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
06794
06795 s = RSTRING_PTR(str);
06796 if (!s) s = "";
06797 saltp = RSTRING_PTR(salt);
06798 #ifdef BROKEN_CRYPT
06799 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
06800 salt_8bit_clean[0] = saltp[0] & 0x7f;
06801 salt_8bit_clean[1] = saltp[1] & 0x7f;
06802 salt_8bit_clean[2] = '\0';
06803 saltp = salt_8bit_clean;
06804 }
06805 #endif
06806 result = rb_str_new2(crypt(s, saltp));
06807 OBJ_INFECT(result, str);
06808 OBJ_INFECT(result, salt);
06809 return result;
06810 }
06811
06812
06813
06814
06815
06816
06817
06818
06819
06820
06821
06822
06823
06824
06825
06826
06827
06828
06829
06830
06831
06832
06833 VALUE
06834 rb_str_intern(VALUE s)
06835 {
06836 VALUE str = RB_GC_GUARD(s);
06837 ID id;
06838
06839 id = rb_intern_str(str);
06840 return ID2SYM(id);
06841 }
06842
06843
06844
06845
06846
06847
06848
06849
06850
06851
06852
06853 VALUE
06854 rb_str_ord(VALUE s)
06855 {
06856 unsigned int c;
06857
06858 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
06859 return UINT2NUM(c);
06860 }
06861
06862
06863
06864
06865
06866
06867
06868
06869
06870
06871
06872 static VALUE
06873 rb_str_sum(int argc, VALUE *argv, VALUE str)
06874 {
06875 VALUE vbits;
06876 int bits;
06877 char *ptr, *p, *pend;
06878 long len;
06879 VALUE sum = INT2FIX(0);
06880 unsigned long sum0 = 0;
06881
06882 if (argc == 0) {
06883 bits = 16;
06884 }
06885 else {
06886 rb_scan_args(argc, argv, "01", &vbits);
06887 bits = NUM2INT(vbits);
06888 }
06889 ptr = p = RSTRING_PTR(str);
06890 len = RSTRING_LEN(str);
06891 pend = p + len;
06892
06893 while (p < pend) {
06894 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
06895 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06896 str_mod_check(str, ptr, len);
06897 sum0 = 0;
06898 }
06899 sum0 += (unsigned char)*p;
06900 p++;
06901 }
06902
06903 if (bits == 0) {
06904 if (sum0) {
06905 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06906 }
06907 }
06908 else {
06909 if (sum == INT2FIX(0)) {
06910 if (bits < (int)sizeof(long)*CHAR_BIT) {
06911 sum0 &= (((unsigned long)1)<<bits)-1;
06912 }
06913 sum = LONG2FIX(sum0);
06914 }
06915 else {
06916 VALUE mod;
06917
06918 if (sum0) {
06919 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06920 }
06921
06922 mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
06923 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
06924 sum = rb_funcall(sum, '&', 1, mod);
06925 }
06926 }
06927 return sum;
06928 }
06929
06930 static VALUE
06931 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
06932 {
06933 rb_encoding *enc;
06934 VALUE w;
06935 long width, len, flen = 1, fclen = 1;
06936 VALUE res;
06937 char *p;
06938 const char *f = " ";
06939 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
06940 volatile VALUE pad;
06941 int singlebyte = 1, cr;
06942
06943 rb_scan_args(argc, argv, "11", &w, &pad);
06944 enc = STR_ENC_GET(str);
06945 width = NUM2LONG(w);
06946 if (argc == 2) {
06947 StringValue(pad);
06948 enc = rb_enc_check(str, pad);
06949 f = RSTRING_PTR(pad);
06950 flen = RSTRING_LEN(pad);
06951 fclen = str_strlen(pad, enc);
06952 singlebyte = single_byte_optimizable(pad);
06953 if (flen == 0 || fclen == 0) {
06954 rb_raise(rb_eArgError, "zero width padding");
06955 }
06956 }
06957 len = str_strlen(str, enc);
06958 if (width < 0 || len >= width) return rb_str_dup(str);
06959 n = width - len;
06960 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
06961 rlen = n - llen;
06962 cr = ENC_CODERANGE(str);
06963 if (flen > 1) {
06964 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
06965 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
06966 }
06967 size = RSTRING_LEN(str);
06968 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
06969 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
06970 (len += llen2 + rlen2) >= LONG_MAX - size) {
06971 rb_raise(rb_eArgError, "argument too big");
06972 }
06973 len += size;
06974 res = rb_str_new5(str, 0, len);
06975 p = RSTRING_PTR(res);
06976 if (flen <= 1) {
06977 memset(p, *f, llen);
06978 p += llen;
06979 }
06980 else {
06981 while (llen >= fclen) {
06982 memcpy(p,f,flen);
06983 p += flen;
06984 llen -= fclen;
06985 }
06986 if (llen > 0) {
06987 memcpy(p, f, llen2);
06988 p += llen2;
06989 }
06990 }
06991 memcpy(p, RSTRING_PTR(str), size);
06992 p += size;
06993 if (flen <= 1) {
06994 memset(p, *f, rlen);
06995 p += rlen;
06996 }
06997 else {
06998 while (rlen >= fclen) {
06999 memcpy(p,f,flen);
07000 p += flen;
07001 rlen -= fclen;
07002 }
07003 if (rlen > 0) {
07004 memcpy(p, f, rlen2);
07005 p += rlen2;
07006 }
07007 }
07008 *p = '\0';
07009 STR_SET_LEN(res, p-RSTRING_PTR(res));
07010 OBJ_INFECT(res, str);
07011 if (!NIL_P(pad)) OBJ_INFECT(res, pad);
07012 rb_enc_associate(res, enc);
07013 if (argc == 2)
07014 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
07015 if (cr != ENC_CODERANGE_BROKEN)
07016 ENC_CODERANGE_SET(res, cr);
07017 return res;
07018 }
07019
07020
07021
07022
07023
07024
07025
07026
07027
07028
07029
07030
07031
07032
07033
07034 static VALUE
07035 rb_str_ljust(int argc, VALUE *argv, VALUE str)
07036 {
07037 return rb_str_justify(argc, argv, str, 'l');
07038 }
07039
07040
07041
07042
07043
07044
07045
07046
07047
07048
07049
07050
07051
07052
07053
07054 static VALUE
07055 rb_str_rjust(int argc, VALUE *argv, VALUE str)
07056 {
07057 return rb_str_justify(argc, argv, str, 'r');
07058 }
07059
07060
07061
07062
07063
07064
07065
07066
07067
07068
07069
07070
07071
07072
07073
07074 static VALUE
07075 rb_str_center(int argc, VALUE *argv, VALUE str)
07076 {
07077 return rb_str_justify(argc, argv, str, 'c');
07078 }
07079
07080
07081
07082
07083
07084
07085
07086
07087
07088
07089
07090
07091
07092
07093
07094
07095 static VALUE
07096 rb_str_partition(VALUE str, VALUE sep)
07097 {
07098 long pos;
07099 int regex = FALSE;
07100
07101 if (TYPE(sep) == T_REGEXP) {
07102 pos = rb_reg_search(sep, str, 0, 0);
07103 regex = TRUE;
07104 }
07105 else {
07106 VALUE tmp;
07107
07108 tmp = rb_check_string_type(sep);
07109 if (NIL_P(tmp)) {
07110 rb_raise(rb_eTypeError, "type mismatch: %s given",
07111 rb_obj_classname(sep));
07112 }
07113 sep = tmp;
07114 pos = rb_str_index(str, sep, 0);
07115 }
07116 if (pos < 0) {
07117 failed:
07118 return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str));
07119 }
07120 if (regex) {
07121 sep = rb_str_subpat(str, sep, INT2FIX(0));
07122 if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
07123 }
07124 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
07125 sep,
07126 rb_str_subseq(str, pos+RSTRING_LEN(sep),
07127 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
07128 }
07129
07130
07131
07132
07133
07134
07135
07136
07137
07138
07139
07140
07141
07142
07143
07144
07145 static VALUE
07146 rb_str_rpartition(VALUE str, VALUE sep)
07147 {
07148 long pos = RSTRING_LEN(str);
07149 int regex = FALSE;
07150
07151 if (TYPE(sep) == T_REGEXP) {
07152 pos = rb_reg_search(sep, str, pos, 1);
07153 regex = TRUE;
07154 }
07155 else {
07156 VALUE tmp;
07157
07158 tmp = rb_check_string_type(sep);
07159 if (NIL_P(tmp)) {
07160 rb_raise(rb_eTypeError, "type mismatch: %s given",
07161 rb_obj_classname(sep));
07162 }
07163 sep = tmp;
07164 pos = rb_str_sublen(str, pos);
07165 pos = rb_str_rindex(str, sep, pos);
07166 }
07167 if (pos < 0) {
07168 return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str);
07169 }
07170 if (regex) {
07171 sep = rb_reg_nth_match(0, rb_backref_get());
07172 }
07173 return rb_ary_new3(3, rb_str_substr(str, 0, pos),
07174 sep,
07175 rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str)));
07176 }
07177
07178
07179
07180
07181
07182
07183
07184
07185
07186
07187
07188
07189
07190
07191
07192
07193
07194 static VALUE
07195 rb_str_start_with(int argc, VALUE *argv, VALUE str)
07196 {
07197 int i;
07198
07199 for (i=0; i<argc; i++) {
07200 VALUE tmp = rb_check_string_type(argv[i]);
07201 if (NIL_P(tmp)) continue;
07202 rb_enc_check(str, tmp);
07203 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
07204 if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
07205 return Qtrue;
07206 }
07207 return Qfalse;
07208 }
07209
07210
07211
07212
07213
07214
07215
07216
07217 static VALUE
07218 rb_str_end_with(int argc, VALUE *argv, VALUE str)
07219 {
07220 int i;
07221 char *p, *s, *e;
07222 rb_encoding *enc;
07223
07224 for (i=0; i<argc; i++) {
07225 VALUE tmp = rb_check_string_type(argv[i]);
07226 if (NIL_P(tmp)) continue;
07227 enc = rb_enc_check(str, tmp);
07228 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
07229 p = RSTRING_PTR(str);
07230 e = p + RSTRING_LEN(str);
07231 s = e - RSTRING_LEN(tmp);
07232 if (rb_enc_left_char_head(p, s, e, enc) != s)
07233 continue;
07234 if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
07235 return Qtrue;
07236 }
07237 return Qfalse;
07238 }
07239
07240 void
07241 rb_str_setter(VALUE val, ID id, VALUE *var)
07242 {
07243 if (!NIL_P(val) && TYPE(val) != T_STRING) {
07244 rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
07245 }
07246 *var = val;
07247 }
07248
07249
07250
07251
07252
07253
07254
07255
07256
07257 static VALUE
07258 rb_str_force_encoding(VALUE str, VALUE enc)
07259 {
07260 str_modifiable(str);
07261 rb_enc_associate(str, rb_to_encoding(enc));
07262 ENC_CODERANGE_CLEAR(str);
07263 return str;
07264 }
07265
07266
07267
07268
07269
07270
07271
07272
07273
07274
07275
07276
07277 static VALUE
07278 rb_str_valid_encoding_p(VALUE str)
07279 {
07280 int cr = rb_enc_str_coderange(str);
07281
07282 return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
07283 }
07284
07285
07286
07287
07288
07289
07290
07291
07292
07293
07294
07295 static VALUE
07296 rb_str_is_ascii_only_p(VALUE str)
07297 {
07298 int cr = rb_enc_str_coderange(str);
07299
07300 return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
07301 }
07302
07317 VALUE
07318 rb_str_ellipsize(VALUE str, long len)
07319 {
07320 static const char ellipsis[] = "...";
07321 const long ellipsislen = sizeof(ellipsis) - 1;
07322 rb_encoding *const enc = rb_enc_get(str);
07323 const long blen = RSTRING_LEN(str);
07324 const char *const p = RSTRING_PTR(str), *e = p + blen;
07325 VALUE estr, ret = 0;
07326
07327 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
07328 if (len * rb_enc_mbminlen(enc) >= blen ||
07329 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
07330 ret = str;
07331 }
07332 else if (len <= ellipsislen ||
07333 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
07334 if (rb_enc_asciicompat(enc)) {
07335 ret = rb_str_new_with_class(str, ellipsis, len);
07336 rb_enc_associate(ret, enc);
07337 }
07338 else {
07339 estr = rb_usascii_str_new(ellipsis, len);
07340 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
07341 }
07342 }
07343 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
07344 rb_str_cat(ret, ellipsis, ellipsislen);
07345 }
07346 else {
07347 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
07348 rb_enc_from_encoding(enc), 0, Qnil);
07349 rb_str_append(ret, estr);
07350 }
07351 return ret;
07352 }
07353
07354
07355
07356
07357
07358
07359
07360
07361
07362
07363
07364
07365
07366
07367
07368
07369
07370
07371
07372
07373
07374
07375
07376
07377
07378
07379
07380
07381
07382
07383
07384
07385
07386
07387
07388
07389
07390
07391
07392
07393
07394
07395
07396 static VALUE
07397 sym_equal(VALUE sym1, VALUE sym2)
07398 {
07399 if (sym1 == sym2) return Qtrue;
07400 return Qfalse;
07401 }
07402
07403
07404 static int
07405 sym_printable(const char *s, const char *send, rb_encoding *enc)
07406 {
07407 while (s < send) {
07408 int n;
07409 int c = rb_enc_codepoint_len(s, send, &n, enc);
07410
07411 if (!rb_enc_isprint(c, enc)) return FALSE;
07412 s += n;
07413 }
07414 return TRUE;
07415 }
07416
07417
07418
07419
07420
07421
07422
07423
07424
07425
07426 static VALUE
07427 sym_inspect(VALUE sym)
07428 {
07429 VALUE str;
07430 ID id = SYM2ID(sym);
07431 rb_encoding *enc;
07432 const char *ptr;
07433 long len;
07434 char *dest;
07435 rb_encoding *resenc = rb_default_internal_encoding();
07436
07437 if (resenc == NULL) resenc = rb_default_external_encoding();
07438 sym = rb_id2str(id);
07439 enc = STR_ENC_GET(sym);
07440 ptr = RSTRING_PTR(sym);
07441 len = RSTRING_LEN(sym);
07442 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
07443 !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) {
07444 str = rb_str_inspect(sym);
07445 len = RSTRING_LEN(str);
07446 rb_str_resize(str, len + 1);
07447 dest = RSTRING_PTR(str);
07448 memmove(dest + 1, dest, len);
07449 dest[0] = ':';
07450 }
07451 else {
07452 char *dest;
07453 str = rb_enc_str_new(0, len + 1, enc);
07454 dest = RSTRING_PTR(str);
07455 dest[0] = ':';
07456 memcpy(dest + 1, ptr, len);
07457 }
07458 return str;
07459 }
07460
07461
07462
07463
07464
07465
07466
07467
07468
07469
07470
07471
07472
07473 VALUE
07474 rb_sym_to_s(VALUE sym)
07475 {
07476 ID id = SYM2ID(sym);
07477
07478 return str_new3(rb_cString, rb_id2str(id));
07479 }
07480
07481
07482
07483
07484
07485
07486
07487
07488
07489
07490
07491
07492 static VALUE
07493 sym_to_sym(VALUE sym)
07494 {
07495 return sym;
07496 }
07497
07498 static VALUE
07499 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv)
07500 {
07501 VALUE obj;
07502
07503 if (argc < 1) {
07504 rb_raise(rb_eArgError, "no receiver given");
07505 }
07506 obj = argv[0];
07507 return rb_funcall_passing_block(obj, (ID)sym, argc - 1, argv + 1);
07508 }
07509
07510
07511
07512
07513
07514
07515
07516
07517
07518
07519 static VALUE
07520 sym_to_proc(VALUE sym)
07521 {
07522 static VALUE sym_proc_cache = Qfalse;
07523 enum {SYM_PROC_CACHE_SIZE = 67};
07524 VALUE proc;
07525 long id, index;
07526 VALUE *aryp;
07527
07528 if (!sym_proc_cache) {
07529 sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2);
07530 rb_gc_register_mark_object(sym_proc_cache);
07531 rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil);
07532 }
07533
07534 id = SYM2ID(sym);
07535 index = (id % SYM_PROC_CACHE_SIZE) << 1;
07536
07537 aryp = RARRAY_PTR(sym_proc_cache);
07538 if (aryp[index] == sym) {
07539 return aryp[index + 1];
07540 }
07541 else {
07542 proc = rb_proc_new(sym_call, (VALUE)id);
07543 aryp[index] = sym;
07544 aryp[index + 1] = proc;
07545 return proc;
07546 }
07547 }
07548
07549
07550
07551
07552
07553
07554
07555
07556
07557 static VALUE
07558 sym_succ(VALUE sym)
07559 {
07560 return rb_str_intern(rb_str_succ(rb_sym_to_s(sym)));
07561 }
07562
07563
07564
07565
07566
07567
07568
07569
07570
07571 static VALUE
07572 sym_cmp(VALUE sym, VALUE other)
07573 {
07574 if (!SYMBOL_P(other)) {
07575 return Qnil;
07576 }
07577 return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
07578 }
07579
07580
07581
07582
07583
07584
07585
07586
07587
07588 static VALUE
07589 sym_casecmp(VALUE sym, VALUE other)
07590 {
07591 if (!SYMBOL_P(other)) {
07592 return Qnil;
07593 }
07594 return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
07595 }
07596
07597
07598
07599
07600
07601
07602
07603
07604 static VALUE
07605 sym_match(VALUE sym, VALUE other)
07606 {
07607 return rb_str_match(rb_sym_to_s(sym), other);
07608 }
07609
07610
07611
07612
07613
07614
07615
07616
07617
07618 static VALUE
07619 sym_aref(int argc, VALUE *argv, VALUE sym)
07620 {
07621 return rb_str_aref_m(argc, argv, rb_sym_to_s(sym));
07622 }
07623
07624
07625
07626
07627
07628
07629
07630
07631 static VALUE
07632 sym_length(VALUE sym)
07633 {
07634 return rb_str_length(rb_id2str(SYM2ID(sym)));
07635 }
07636
07637
07638
07639
07640
07641
07642
07643
07644 static VALUE
07645 sym_empty(VALUE sym)
07646 {
07647 return rb_str_empty(rb_id2str(SYM2ID(sym)));
07648 }
07649
07650
07651
07652
07653
07654
07655
07656
07657 static VALUE
07658 sym_upcase(VALUE sym)
07659 {
07660 return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym))));
07661 }
07662
07663
07664
07665
07666
07667
07668
07669
07670 static VALUE
07671 sym_downcase(VALUE sym)
07672 {
07673 return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym))));
07674 }
07675
07676
07677
07678
07679
07680
07681
07682
07683 static VALUE
07684 sym_capitalize(VALUE sym)
07685 {
07686 return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym))));
07687 }
07688
07689
07690
07691
07692
07693
07694
07695
07696 static VALUE
07697 sym_swapcase(VALUE sym)
07698 {
07699 return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym))));
07700 }
07701
07702
07703
07704
07705
07706
07707
07708
07709 static VALUE
07710 sym_encoding(VALUE sym)
07711 {
07712 return rb_obj_encoding(rb_id2str(SYM2ID(sym)));
07713 }
07714
07715 ID
07716 rb_to_id(VALUE name)
07717 {
07718 VALUE tmp;
07719
07720 switch (TYPE(name)) {
07721 default:
07722 tmp = rb_check_string_type(name);
07723 if (NIL_P(tmp)) {
07724 tmp = rb_inspect(name);
07725 rb_raise(rb_eTypeError, "%s is not a symbol",
07726 RSTRING_PTR(tmp));
07727 }
07728 name = tmp;
07729
07730 case T_STRING:
07731 name = rb_str_intern(name);
07732
07733 case T_SYMBOL:
07734 return SYM2ID(name);
07735 }
07736 return Qnil;
07737 }
07738
07739
07740
07741
07742
07743
07744
07745
07746
07747
07748
07749
07750
07751
/*
 * Creates the String and Symbol classes and registers their methods.
 *
 * Both classes are defined under rb_cObject and include rb_mComparable.
 * The function also initialises id_to_s and rb_fs, and binds the global
 * variables $; and $-F to rb_fs.
 */
07752 void
07753 Init_String(void)
07754 {
/* Within this function rb_intern() expands to rb_intern_const(). */
07755 #undef rb_intern
07756 #define rb_intern(str) rb_intern_const(str)
07757
/* String: class definition, allocator, construction, comparison, basic access. */
07758 rb_cString = rb_define_class("String", rb_cObject);
07759 rb_include_module(rb_cString, rb_mComparable);
07760 rb_define_alloc_func(rb_cString, str_alloc);
07761 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
07762 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
07763 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
07764 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
07765 rb_define_method(rb_cString, "==", rb_str_equal, 1);
07766 rb_define_method(rb_cString, "===", rb_str_equal, 1);
07767 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
07768 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
07769 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
07770 rb_define_method(rb_cString, "+", rb_str_plus, 1);
07771 rb_define_method(rb_cString, "*", rb_str_times, 1);
07772 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
07773 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
07774 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
07775 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
07776 rb_define_method(rb_cString, "length", rb_str_length, 0);
07777 rb_define_method(rb_cString, "size", rb_str_length, 0);
07778 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
07779 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
07780 rb_define_method(rb_cString, "=~", rb_str_match, 1);
07781 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
07782 rb_define_method(rb_cString, "succ", rb_str_succ, 0);
07783 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
07784 rb_define_method(rb_cString, "next", rb_str_succ, 0);
07785 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
07786 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
07787 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
07788 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
07789 rb_define_method(rb_cString, "replace", rb_str_replace, 1);
07790 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
07791 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
07792 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
07793 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
07794 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
07795
/* String: conversions and printable representations. */
07796 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
07797 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
07798 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
07799 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
07800 rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
07801 rb_define_method(rb_cString, "dump", rb_str_dump, 0);
07802
/* String: case conversion returning a value. */
07803 rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
07804 rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
07805 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
07806 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);
07807
/* String: case conversion, bang (!) variants. */
07808 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
07809 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
07810 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
07811 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);
07812
/* String: numeric parsing, splitting, enumeration aliases, concatenation, interning. */
/* Note: "lines", "bytes", "chars" and "codepoints" share the each_* implementations. */
07813 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
07814 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
07815 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
07816 rb_define_method(rb_cString, "lines", rb_str_each_line, -1);
07817 rb_define_method(rb_cString, "bytes", rb_str_each_byte, 0);
07818 rb_define_method(rb_cString, "chars", rb_str_each_char, 0);
07819 rb_define_method(rb_cString, "codepoints", rb_str_each_codepoint, 0);
07820 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
07821 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
07822 rb_define_method(rb_cString, "concat", rb_str_concat, 1);
07823 rb_define_method(rb_cString, "<<", rb_str_concat, 1);
07824 rb_define_method(rb_cString, "prepend", rb_str_prepend, 1);
07825 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
07826 rb_define_method(rb_cString, "intern", rb_str_intern, 0);
07827 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
07828 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
07829
/* String: substring predicates. */
07830 rb_define_method(rb_cString, "include?", rb_str_include, 1);
07831 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
07832 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
07833
07834 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
07835
/* String: justification. */
07836 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
07837 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
07838 rb_define_method(rb_cString, "center", rb_str_center, -1);
07839
/* String: substitution and trimming returning a value. */
07840 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
07841 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
07842 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
07843 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
07844 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
07845 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
07846 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
07847
/* String: substitution and trimming, bang (!) variants. */
07848 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
07849 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
07850 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
07851 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
07852 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
07853 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
07854 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
07855
/* String: character-set operations returning a value. */
07856 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
07857 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
07858 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
07859 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
07860 rb_define_method(rb_cString, "count", rb_str_count, -1);
07861
/* String: character-set operations, bang (!) variants. */
07862 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
07863 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
07864 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
07865 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
07866
/* String: iteration. */
07867 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
07868 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
07869 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
07870 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
07871
07872 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
07873
/* "slice" shares its implementation with "[]". */
07874 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
07875 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
07876
07877 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
07878 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
07879
/* String: encoding queries and manipulation. */
07880 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0);
07881 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
07882 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
07883 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
07884
07885 id_to_s = rb_intern("to_s");
07886
/* rb_fs starts out as nil and is exposed as both $; and $-F. */
07887 rb_fs = Qnil;
07888 rb_define_variable("$;", &rb_fs);
07889 rb_define_variable("$-F", &rb_fs);
07890
/* Symbol: no allocator and no "new"; only Symbol.all_symbols is defined as a singleton method. */
07891 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
07892 rb_include_module(rb_cSymbol, rb_mComparable);
07893 rb_undef_alloc_func(rb_cSymbol);
07894 rb_undef_method(CLASS_OF(rb_cSymbol), "new");
07895 rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0);
07896
/* Symbol: equality, representation and conversions. */
07897 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
07898 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
07899 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
07900 rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
07901 rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
07902 rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
07903 rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
07904 rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
07905 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
07906 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
07907
/* Symbol: comparison and matching. */
07908 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
07909 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
07910 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
07911
/* Symbol: string-like accessors; "match" uses the same function as "=~". */
07912 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
07913 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
07914 rb_define_method(rb_cSymbol, "length", sym_length, 0);
07915 rb_define_method(rb_cSymbol, "size", sym_length, 0);
07916 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
07917 rb_define_method(rb_cSymbol, "match", sym_match, 1);
07918
/* Symbol: case conversion. */
07919 rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
07920 rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
07921 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
07922 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);
07923
07924 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
07925 }
07926