00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014 #include "ruby/ruby.h"
00015 #include "ruby/re.h"
00016 #include "ruby/encoding.h"
00017 #include "internal.h"
00018 #include <assert.h>
00019
00020 #define BEG(no) (regs->beg[(no)])
00021 #define END(no) (regs->end[(no)])
00022
00023 #include <math.h>
00024 #include <ctype.h>
00025
00026 #ifdef HAVE_UNISTD_H
00027 #include <unistd.h>
00028 #endif
00029
00030 #define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
00031
00032 #undef rb_str_new_cstr
00033 #undef rb_tainted_str_new_cstr
00034 #undef rb_usascii_str_new_cstr
00035 #undef rb_external_str_new_cstr
00036 #undef rb_locale_str_new_cstr
00037 #undef rb_str_new2
00038 #undef rb_str_new3
00039 #undef rb_str_new4
00040 #undef rb_str_new5
00041 #undef rb_tainted_str_new2
00042 #undef rb_usascii_str_new2
00043 #undef rb_str_dup_frozen
00044 #undef rb_str_buf_new_cstr
00045 #undef rb_str_buf_new2
00046 #undef rb_str_buf_cat2
00047 #undef rb_str_cat2
00048
00049 static VALUE rb_str_clear(VALUE str);
00050
00051 VALUE rb_cString;
00052 VALUE rb_cSymbol;
00053
#define RUBY_MAX_CHAR_LEN 16

/* String-specific flag bits, manipulated through the FL_* macros. */
#define STR_TMPLOCK FL_USER7
#define STR_NOEMBED FL_USER1
#define STR_SHARED  FL_USER2
#define STR_ASSOC   FL_USER3

/* True when s has both STR_NOEMBED and ELTS_SHARED set. */
#define STR_SHARED_P(s) FL_ALL((s), STR_NOEMBED|ELTS_SHARED)
/* True when s has both STR_NOEMBED and STR_ASSOC set. */
#define STR_ASSOC_P(s)  FL_ALL((s), STR_NOEMBED|STR_ASSOC)

/* Flag combination under which heap.aux is not used as a capacity. */
#define STR_NOCAPA      (STR_NOEMBED|ELTS_SHARED|STR_ASSOC)
#define STR_NOCAPA_P(s) (FL_TEST((s),STR_NOEMBED) && FL_ANY((s),ELTS_SHARED|STR_ASSOC))

/* Clear ELTS_SHARED and STR_ASSOC, but only on a non-embedded string. */
#define STR_UNSET_NOCAPA(s) do {\
    if (FL_TEST((s),STR_NOEMBED)) {\
        FL_UNSET((s),(ELTS_SHARED|STR_ASSOC));\
    }\
} while (0)
00066
00067
/* Mark str as heap-allocated and reset the embedded-length bits to 0. */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    STR_SET_EMBED_LEN((str), 0);\
} while (0)

/* Mark str as embedded / test whether str is embedded. */
#define STR_SET_EMBED(str) FL_UNSET((str), STR_NOEMBED)
#define STR_EMBED_P(str)   (!FL_TEST((str), STR_NOEMBED))

/*
 * Store n in the embedded-length bit field of the flags word.
 * n is evaluated once, before the flags are touched.
 */
#define STR_SET_EMBED_LEN(str, n) do { \
    long tmp_n = (n);\
    RBASIC(str)->flags = (RBASIC(str)->flags & ~RSTRING_EMBED_LEN_MASK) |\
                         ((tmp_n) << RSTRING_EMBED_LEN_SHIFT);\
} while (0)
00079
/*
 * Set the length of str to n, writing either the embedded-length flag bits
 * or the heap length field depending on the current representation.
 */
#define STR_SET_LEN(str, n) do { \
    if (!STR_EMBED_P(str)) {\
        RSTRING(str)->as.heap.len = (n);\
    }\
    else {\
        STR_SET_EMBED_LEN((str), (n));\
    }\
} while (0)
00088
/* Shorten str by exactly one byte, for either representation. */
#define STR_DEC_LEN(str) do {\
    if (!STR_EMBED_P(str)) {\
        RSTRING(str)->as.heap.len--;\
    }\
    else {\
        long n = RSTRING_LEN(str) - 1;\
        STR_SET_EMBED_LEN((str), n);\
    }\
} while (0)
00099
/*
 * Give str room for `capacity` bytes plus one extra byte.
 *
 * Heap strings are reallocated in place; the recorded capacity is updated
 * unless heap.aux is in use for something else (STR_NOCAPA_P).
 * Embedded strings are left alone unless `capacity` exceeds
 * RSTRING_EMBED_LEN_MAX, in which case the bytes are copied to a fresh heap
 * buffer.  In that path the length must be read while the string is still
 * flagged as embedded, so the statement order matters.
 */
#define RESIZE_CAPA(str,capacity) do {\
    if (!STR_EMBED_P(str)) {\
        REALLOC_N(RSTRING(str)->as.heap.ptr, char, (capacity)+1);\
        if (!STR_NOCAPA_P(str))\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
    else if ((capacity) > RSTRING_EMBED_LEN_MAX) {\
        char *tmp = ALLOC_N(char, (capacity)+1);\
        memcpy(tmp, RSTRING_PTR(str), RSTRING_LEN(str));\
        RSTRING(str)->as.heap.ptr = tmp;\
        RSTRING(str)->as.heap.len = RSTRING_LEN(str);\
        STR_SET_NOEMBED(str);\
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
00117
/* Code-range predicates; both may scan the string via rb_enc_str_coderange(). */
#define is_ascii_string(str)  (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
#define is_broken_string(str) (rb_enc_str_coderange(str) == ENC_CODERANGE_BROKEN)

/* Encoding object for the encoding index recorded on str. */
#define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str))
00122
00123 static inline int
00124 single_byte_optimizable(VALUE str)
00125 {
00126 rb_encoding *enc;
00127
00128
00129 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT)
00130 return 1;
00131
00132 enc = STR_ENC_GET(str);
00133 if (rb_enc_mbmaxlen(enc) == 1)
00134 return 1;
00135
00136
00137
00138 return 0;
00139 }
00140
00141 VALUE rb_fs;
00142
/*
 * Return a pointer to the first byte in [p, e) for which ISASCII() is false,
 * or NULL when there is none.
 *
 * When VALUE is 4 or 8 bytes wide and the range is longer than two words,
 * the aligned middle part is tested one word at a time against
 * NONASCII_MASK; the unaligned head and the tail (or the word that matched)
 * are then examined byte by byte.
 */
static inline const char *
search_nonascii(const char *p, const char *e)
{
#if SIZEOF_VALUE == 8
# define NONASCII_MASK 0x8080808080808080ULL
#elif SIZEOF_VALUE == 4
# define NONASCII_MASK 0x80808080UL
#endif
#ifdef NONASCII_MASK
    if ((int)sizeof(VALUE) * 2 < e - p) {
        const VALUE align_mask = sizeof(VALUE) - 1;
        /* first word boundary at or after p, last word boundary at or before e */
        const VALUE *word = (const VALUE*)(~align_mask & ((VALUE)p + align_mask));
        const VALUE *word_end = (const VALUE*)(~align_mask & (VALUE)e);

        for (; p < (const char *)word; p++) {
            if (!ISASCII(*p))
                return p;
        }
        while (word < word_end && !(*word & NONASCII_MASK)) {
            word++;
        }
        /* resume the byte scan at the matching word, or at word_end */
        p = (const char *)word;
    }
#endif
    for (; p < e; p++) {
        if (!ISASCII(*p))
            return p;
    }
    return NULL;
}
00179
00180 static int
00181 coderange_scan(const char *p, long len, rb_encoding *enc)
00182 {
00183 const char *e = p + len;
00184
00185 if (rb_enc_to_index(enc) == 0) {
00186
00187 p = search_nonascii(p, e);
00188 return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
00189 }
00190
00191 if (rb_enc_asciicompat(enc)) {
00192 p = search_nonascii(p, e);
00193 if (!p) {
00194 return ENC_CODERANGE_7BIT;
00195 }
00196 while (p < e) {
00197 int ret = rb_enc_precise_mbclen(p, e, enc);
00198 if (!MBCLEN_CHARFOUND_P(ret)) {
00199 return ENC_CODERANGE_BROKEN;
00200 }
00201 p += MBCLEN_CHARFOUND_LEN(ret);
00202 if (p < e) {
00203 p = search_nonascii(p, e);
00204 if (!p) {
00205 return ENC_CODERANGE_VALID;
00206 }
00207 }
00208 }
00209 if (e < p) {
00210 return ENC_CODERANGE_BROKEN;
00211 }
00212 return ENC_CODERANGE_VALID;
00213 }
00214
00215 while (p < e) {
00216 int ret = rb_enc_precise_mbclen(p, e, enc);
00217
00218 if (!MBCLEN_CHARFOUND_P(ret)) {
00219 return ENC_CODERANGE_BROKEN;
00220 }
00221 p += MBCLEN_CHARFOUND_LEN(ret);
00222 }
00223 if (e < p) {
00224 return ENC_CODERANGE_BROKEN;
00225 }
00226 return ENC_CODERANGE_VALID;
00227 }
00228
00229 long
00230 rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
00231 {
00232 const char *p = s;
00233
00234 if (*cr == ENC_CODERANGE_BROKEN)
00235 return e - s;
00236
00237 if (rb_enc_to_index(enc) == 0) {
00238
00239 p = search_nonascii(p, e);
00240 *cr = (!p && *cr != ENC_CODERANGE_VALID) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
00241 return e - s;
00242 }
00243 else if (rb_enc_asciicompat(enc)) {
00244 p = search_nonascii(p, e);
00245 if (!p) {
00246 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
00247 return e - s;
00248 }
00249 while (p < e) {
00250 int ret = rb_enc_precise_mbclen(p, e, enc);
00251 if (!MBCLEN_CHARFOUND_P(ret)) {
00252 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00253 return p - s;
00254 }
00255 p += MBCLEN_CHARFOUND_LEN(ret);
00256 if (p < e) {
00257 p = search_nonascii(p, e);
00258 if (!p) {
00259 *cr = ENC_CODERANGE_VALID;
00260 return e - s;
00261 }
00262 }
00263 }
00264 *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00265 return p - s;
00266 }
00267 else {
00268 while (p < e) {
00269 int ret = rb_enc_precise_mbclen(p, e, enc);
00270 if (!MBCLEN_CHARFOUND_P(ret)) {
00271 *cr = MBCLEN_INVALID_P(ret) ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_UNKNOWN;
00272 return p - s;
00273 }
00274 p += MBCLEN_CHARFOUND_LEN(ret);
00275 }
00276 *cr = e < p ? ENC_CODERANGE_BROKEN: ENC_CODERANGE_VALID;
00277 return p - s;
00278 }
00279 }
00280
00281 static inline void
00282 str_enc_copy(VALUE str1, VALUE str2)
00283 {
00284 rb_enc_set_index(str1, ENCODING_GET(str2));
00285 }
00286
00287 static void
00288 rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
00289 {
00290
00291
00292
00293 str_enc_copy(dest, src);
00294 switch (ENC_CODERANGE(src)) {
00295 case ENC_CODERANGE_7BIT:
00296 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00297 break;
00298 case ENC_CODERANGE_VALID:
00299 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
00300 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
00301 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00302 else
00303 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00304 break;
00305 default:
00306 if (RSTRING_LEN(dest) == 0) {
00307 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
00308 ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
00309 else
00310 ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
00311 }
00312 break;
00313 }
00314 }
00315
00316 static void
00317 rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
00318 {
00319 str_enc_copy(dest, src);
00320 ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
00321 }
00322
00323 int
00324 rb_enc_str_coderange(VALUE str)
00325 {
00326 int cr = ENC_CODERANGE(str);
00327
00328 if (cr == ENC_CODERANGE_UNKNOWN) {
00329 rb_encoding *enc = STR_ENC_GET(str);
00330 cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
00331 ENC_CODERANGE_SET(str, cr);
00332 }
00333 return cr;
00334 }
00335
00336 int
00337 rb_enc_str_asciionly_p(VALUE str)
00338 {
00339 rb_encoding *enc = STR_ENC_GET(str);
00340
00341 if (!rb_enc_asciicompat(enc))
00342 return FALSE;
00343 else if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT)
00344 return TRUE;
00345 return FALSE;
00346 }
00347
00348 static inline void
00349 str_mod_check(VALUE s, const char *p, long len)
00350 {
00351 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
00352 rb_raise(rb_eRuntimeError, "string modified");
00353 }
00354 }
00355
00356 size_t
00357 rb_str_capacity(VALUE str)
00358 {
00359 if (STR_EMBED_P(str)) {
00360 return RSTRING_EMBED_LEN_MAX;
00361 }
00362 else if (STR_NOCAPA_P(str)) {
00363 return RSTRING(str)->as.heap.len;
00364 }
00365 else {
00366 return RSTRING(str)->as.heap.aux.capa;
00367 }
00368 }
00369
00370 static inline VALUE
00371 str_alloc(VALUE klass)
00372 {
00373 NEWOBJ(str, struct RString);
00374 OBJSETUP(str, klass, T_STRING);
00375
00376 str->as.heap.ptr = 0;
00377 str->as.heap.len = 0;
00378 str->as.heap.aux.capa = 0;
00379
00380 return (VALUE)str;
00381 }
00382
00383 static VALUE
00384 str_new(VALUE klass, const char *ptr, long len)
00385 {
00386 VALUE str;
00387
00388 if (len < 0) {
00389 rb_raise(rb_eArgError, "negative string size (or size too big)");
00390 }
00391
00392 str = str_alloc(klass);
00393 if (len > RSTRING_EMBED_LEN_MAX) {
00394 RSTRING(str)->as.heap.aux.capa = len;
00395 RSTRING(str)->as.heap.ptr = ALLOC_N(char,len+1);
00396 STR_SET_NOEMBED(str);
00397 }
00398 else if (len == 0) {
00399 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
00400 }
00401 if (ptr) {
00402 memcpy(RSTRING_PTR(str), ptr, len);
00403 }
00404 STR_SET_LEN(str, len);
00405 RSTRING_PTR(str)[len] = '\0';
00406 return str;
00407 }
00408
00409 VALUE
00410 rb_str_new(const char *ptr, long len)
00411 {
00412 return str_new(rb_cString, ptr, len);
00413 }
00414
00415 VALUE
00416 rb_usascii_str_new(const char *ptr, long len)
00417 {
00418 VALUE str = rb_str_new(ptr, len);
00419 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00420 return str;
00421 }
00422
00423 VALUE
00424 rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
00425 {
00426 VALUE str = rb_str_new(ptr, len);
00427 rb_enc_associate(str, enc);
00428 return str;
00429 }
00430
00431 VALUE
00432 rb_str_new_cstr(const char *ptr)
00433 {
00434 if (!ptr) {
00435 rb_raise(rb_eArgError, "NULL pointer given");
00436 }
00437 return rb_str_new(ptr, strlen(ptr));
00438 }
00439
00440 RUBY_ALIAS_FUNCTION(rb_str_new2(const char *ptr), rb_str_new_cstr, (ptr))
00441 #define rb_str_new2 rb_str_new_cstr
00442
00443 VALUE
00444 rb_usascii_str_new_cstr(const char *ptr)
00445 {
00446 VALUE str = rb_str_new2(ptr);
00447 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
00448 return str;
00449 }
00450
00451 RUBY_ALIAS_FUNCTION(rb_usascii_str_new2(const char *ptr), rb_usascii_str_new_cstr, (ptr))
00452 #define rb_usascii_str_new2 rb_usascii_str_new_cstr
00453
00454 VALUE
00455 rb_tainted_str_new(const char *ptr, long len)
00456 {
00457 VALUE str = rb_str_new(ptr, len);
00458
00459 OBJ_TAINT(str);
00460 return str;
00461 }
00462
00463 VALUE
00464 rb_tainted_str_new_cstr(const char *ptr)
00465 {
00466 VALUE str = rb_str_new2(ptr);
00467
00468 OBJ_TAINT(str);
00469 return str;
00470 }
00471
00472 RUBY_ALIAS_FUNCTION(rb_tainted_str_new2(const char *ptr), rb_tainted_str_new_cstr, (ptr))
00473 #define rb_tainted_str_new2 rb_tainted_str_new_cstr
00474
00475 VALUE
00476 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
00477 {
00478 rb_econv_t *ec;
00479 rb_econv_result_t ret;
00480 long len;
00481 VALUE newstr;
00482 const unsigned char *sp;
00483 unsigned char *dp;
00484
00485 if (!to) return str;
00486 if (from == to) return str;
00487 if ((rb_enc_asciicompat(to) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) ||
00488 to == rb_ascii8bit_encoding()) {
00489 if (STR_ENC_GET(str) != to) {
00490 str = rb_str_dup(str);
00491 rb_enc_associate(str, to);
00492 }
00493 return str;
00494 }
00495
00496 len = RSTRING_LEN(str);
00497 newstr = rb_str_new(0, len);
00498
00499 retry:
00500 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
00501 if (!ec) return str;
00502
00503 sp = (unsigned char*)RSTRING_PTR(str);
00504 dp = (unsigned char*)RSTRING_PTR(newstr);
00505 ret = rb_econv_convert(ec, &sp, (unsigned char*)RSTRING_END(str),
00506 &dp, (unsigned char*)RSTRING_END(newstr), 0);
00507 rb_econv_close(ec);
00508 switch (ret) {
00509 case econv_destination_buffer_full:
00510
00511 len = len < 2 ? 2 : len * 2;
00512 rb_str_resize(newstr, len);
00513 goto retry;
00514
00515 case econv_finished:
00516 len = dp - (unsigned char*)RSTRING_PTR(newstr);
00517 rb_str_set_len(newstr, len);
00518 rb_enc_associate(newstr, to);
00519 return newstr;
00520
00521 default:
00522
00523 return str;
00524 }
00525 }
00526
00527 VALUE
00528 rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
00529 {
00530 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
00531 }
00532
00533 VALUE
00534 rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
00535 {
00536 VALUE str;
00537
00538 str = rb_tainted_str_new(ptr, len);
00539 if (eenc == rb_usascii_encoding() &&
00540 rb_enc_str_coderange(str) != ENC_CODERANGE_7BIT) {
00541 rb_enc_associate(str, rb_ascii8bit_encoding());
00542 return str;
00543 }
00544 rb_enc_associate(str, eenc);
00545 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
00546 }
00547
00548 VALUE
00549 rb_external_str_new(const char *ptr, long len)
00550 {
00551 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
00552 }
00553
00554 VALUE
00555 rb_external_str_new_cstr(const char *ptr)
00556 {
00557 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
00558 }
00559
00560 VALUE
00561 rb_locale_str_new(const char *ptr, long len)
00562 {
00563 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
00564 }
00565
00566 VALUE
00567 rb_locale_str_new_cstr(const char *ptr)
00568 {
00569 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
00570 }
00571
00572 VALUE
00573 rb_filesystem_str_new(const char *ptr, long len)
00574 {
00575 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
00576 }
00577
00578 VALUE
00579 rb_filesystem_str_new_cstr(const char *ptr)
00580 {
00581 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
00582 }
00583
00584 VALUE
00585 rb_str_export(VALUE str)
00586 {
00587 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_default_external_encoding());
00588 }
00589
00590 VALUE
00591 rb_str_export_locale(VALUE str)
00592 {
00593 return rb_str_conv_enc(str, STR_ENC_GET(str), rb_locale_encoding());
00594 }
00595
00596 VALUE
00597 rb_str_export_to_enc(VALUE str, rb_encoding *enc)
00598 {
00599 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
00600 }
00601
00602 static VALUE
00603 str_replace_shared(VALUE str2, VALUE str)
00604 {
00605 if (RSTRING_LEN(str) <= RSTRING_EMBED_LEN_MAX) {
00606 STR_SET_EMBED(str2);
00607 memcpy(RSTRING_PTR(str2), RSTRING_PTR(str), RSTRING_LEN(str)+1);
00608 STR_SET_EMBED_LEN(str2, RSTRING_LEN(str));
00609 }
00610 else {
00611 str = rb_str_new_frozen(str);
00612 FL_SET(str2, STR_NOEMBED);
00613 RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00614 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00615 RSTRING(str2)->as.heap.aux.shared = str;
00616 FL_SET(str2, ELTS_SHARED);
00617 }
00618 rb_enc_cr_str_exact_copy(str2, str);
00619
00620 return str2;
00621 }
00622
00623 static VALUE
00624 str_new_shared(VALUE klass, VALUE str)
00625 {
00626 return str_replace_shared(str_alloc(klass), str);
00627 }
00628
00629 static VALUE
00630 str_new3(VALUE klass, VALUE str)
00631 {
00632 return str_new_shared(klass, str);
00633 }
00634
00635 VALUE
00636 rb_str_new_shared(VALUE str)
00637 {
00638 VALUE str2 = str_new3(rb_obj_class(str), str);
00639
00640 OBJ_INFECT(str2, str);
00641 return str2;
00642 }
00643
00644 RUBY_ALIAS_FUNCTION(rb_str_new3(VALUE str), rb_str_new_shared, (str))
00645 #define rb_str_new3 rb_str_new_shared
00646
00647 static VALUE
00648 str_new4(VALUE klass, VALUE str)
00649 {
00650 VALUE str2;
00651
00652 str2 = str_alloc(klass);
00653 STR_SET_NOEMBED(str2);
00654 RSTRING(str2)->as.heap.len = RSTRING_LEN(str);
00655 RSTRING(str2)->as.heap.ptr = RSTRING_PTR(str);
00656 if (STR_SHARED_P(str)) {
00657 VALUE shared = RSTRING(str)->as.heap.aux.shared;
00658 assert(OBJ_FROZEN(shared));
00659 FL_SET(str2, ELTS_SHARED);
00660 RSTRING(str2)->as.heap.aux.shared = shared;
00661 }
00662 else {
00663 FL_SET(str, ELTS_SHARED);
00664 RSTRING(str)->as.heap.aux.shared = str2;
00665 }
00666 rb_enc_cr_str_exact_copy(str2, str);
00667 OBJ_INFECT(str2, str);
00668 return str2;
00669 }
00670
00671 VALUE
00672 rb_str_new_frozen(VALUE orig)
00673 {
00674 VALUE klass, str;
00675
00676 if (OBJ_FROZEN(orig)) return orig;
00677 klass = rb_obj_class(orig);
00678 if (STR_SHARED_P(orig) && (str = RSTRING(orig)->as.heap.aux.shared)) {
00679 long ofs;
00680 assert(OBJ_FROZEN(str));
00681 ofs = RSTRING_LEN(str) - RSTRING_LEN(orig);
00682 if ((ofs > 0) || (klass != RBASIC(str)->klass) ||
00683 (!OBJ_TAINTED(str) && OBJ_TAINTED(orig)) ||
00684 ENCODING_GET(str) != ENCODING_GET(orig)) {
00685 str = str_new3(klass, str);
00686 RSTRING(str)->as.heap.ptr += ofs;
00687 RSTRING(str)->as.heap.len -= ofs;
00688 rb_enc_cr_str_exact_copy(str, orig);
00689 OBJ_INFECT(str, orig);
00690 }
00691 }
00692 else if (STR_EMBED_P(orig)) {
00693 str = str_new(klass, RSTRING_PTR(orig), RSTRING_LEN(orig));
00694 rb_enc_cr_str_exact_copy(str, orig);
00695 OBJ_INFECT(str, orig);
00696 }
00697 else if (STR_ASSOC_P(orig)) {
00698 VALUE assoc = RSTRING(orig)->as.heap.aux.shared;
00699 FL_UNSET(orig, STR_ASSOC);
00700 str = str_new4(klass, orig);
00701 FL_SET(str, STR_ASSOC);
00702 RSTRING(str)->as.heap.aux.shared = assoc;
00703 }
00704 else {
00705 str = str_new4(klass, orig);
00706 }
00707 OBJ_FREEZE(str);
00708 return str;
00709 }
00710
00711 RUBY_ALIAS_FUNCTION(rb_str_new4(VALUE orig), rb_str_new_frozen, (orig))
00712 #define rb_str_new4 rb_str_new_frozen
00713
00714 VALUE
00715 rb_str_new_with_class(VALUE obj, const char *ptr, long len)
00716 {
00717 return str_new(rb_obj_class(obj), ptr, len);
00718 }
00719
00720 RUBY_ALIAS_FUNCTION(rb_str_new5(VALUE obj, const char *ptr, long len),
00721 rb_str_new_with_class, (obj, ptr, len))
00722 #define rb_str_new5 rb_str_new_with_class
00723
00724 static VALUE
00725 str_new_empty(VALUE str)
00726 {
00727 VALUE v = rb_str_new5(str, 0, 0);
00728 rb_enc_copy(v, str);
00729 OBJ_INFECT(v, str);
00730 return v;
00731 }
00732
00733 #define STR_BUF_MIN_SIZE 128
00734
00735 VALUE
00736 rb_str_buf_new(long capa)
00737 {
00738 VALUE str = str_alloc(rb_cString);
00739
00740 if (capa < STR_BUF_MIN_SIZE) {
00741 capa = STR_BUF_MIN_SIZE;
00742 }
00743 FL_SET(str, STR_NOEMBED);
00744 RSTRING(str)->as.heap.aux.capa = capa;
00745 RSTRING(str)->as.heap.ptr = ALLOC_N(char, capa+1);
00746 RSTRING(str)->as.heap.ptr[0] = '\0';
00747
00748 return str;
00749 }
00750
00751 VALUE
00752 rb_str_buf_new_cstr(const char *ptr)
00753 {
00754 VALUE str;
00755 long len = strlen(ptr);
00756
00757 str = rb_str_buf_new(len);
00758 rb_str_buf_cat(str, ptr, len);
00759
00760 return str;
00761 }
00762
00763 RUBY_ALIAS_FUNCTION(rb_str_buf_new2(const char *ptr), rb_str_buf_new_cstr, (ptr))
00764 #define rb_str_buf_new2 rb_str_buf_new_cstr
00765
00766 VALUE
00767 rb_str_tmp_new(long len)
00768 {
00769 return str_new(0, 0, len);
00770 }
00771
00772 void *
00773 rb_alloc_tmp_buffer(volatile VALUE *store, long len)
00774 {
00775 VALUE s = rb_str_tmp_new(len);
00776 *store = s;
00777 return RSTRING_PTR(s);
00778 }
00779
00780 void
00781 rb_free_tmp_buffer(volatile VALUE *store)
00782 {
00783 VALUE s = *store;
00784 *store = 0;
00785 if (s) rb_str_clear(s);
00786 }
00787
00788 void
00789 rb_str_free(VALUE str)
00790 {
00791 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00792 xfree(RSTRING(str)->as.heap.ptr);
00793 }
00794 }
00795
00796 RUBY_FUNC_EXPORTED size_t
00797 rb_str_memsize(VALUE str)
00798 {
00799 if (!STR_EMBED_P(str) && !STR_SHARED_P(str)) {
00800 return RSTRING(str)->as.heap.aux.capa;
00801 }
00802 else {
00803 return 0;
00804 }
00805 }
00806
00807 VALUE
00808 rb_str_to_str(VALUE str)
00809 {
00810 return rb_convert_type(str, T_STRING, "String", "to_str");
00811 }
00812
00813 static inline void str_discard(VALUE str);
00814
00815 void
00816 rb_str_shared_replace(VALUE str, VALUE str2)
00817 {
00818 rb_encoding *enc;
00819 int cr;
00820 if (str == str2) return;
00821 enc = STR_ENC_GET(str2);
00822 cr = ENC_CODERANGE(str2);
00823 str_discard(str);
00824 OBJ_INFECT(str, str2);
00825 if (RSTRING_LEN(str2) <= RSTRING_EMBED_LEN_MAX) {
00826 STR_SET_EMBED(str);
00827 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), RSTRING_LEN(str2)+1);
00828 STR_SET_EMBED_LEN(str, RSTRING_LEN(str2));
00829 rb_enc_associate(str, enc);
00830 ENC_CODERANGE_SET(str, cr);
00831 return;
00832 }
00833 STR_SET_NOEMBED(str);
00834 STR_UNSET_NOCAPA(str);
00835 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00836 RSTRING(str)->as.heap.len = RSTRING_LEN(str2);
00837 if (STR_NOCAPA_P(str2)) {
00838 FL_SET(str, RBASIC(str2)->flags & STR_NOCAPA);
00839 RSTRING(str)->as.heap.aux.shared = RSTRING(str2)->as.heap.aux.shared;
00840 }
00841 else {
00842 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
00843 }
00844 STR_SET_EMBED(str2);
00845 RSTRING_PTR(str2)[0] = 0;
00846 STR_SET_EMBED_LEN(str2, 0);
00847 rb_enc_associate(str, enc);
00848 ENC_CODERANGE_SET(str, cr);
00849 }
00850
00851 static ID id_to_s;
00852
00853 VALUE
00854 rb_obj_as_string(VALUE obj)
00855 {
00856 VALUE str;
00857
00858 if (TYPE(obj) == T_STRING) {
00859 return obj;
00860 }
00861 str = rb_funcall(obj, id_to_s, 0);
00862 if (TYPE(str) != T_STRING)
00863 return rb_any_to_s(obj);
00864 if (OBJ_TAINTED(obj)) OBJ_TAINT(str);
00865 return str;
00866 }
00867
00868 static VALUE
00869 str_replace(VALUE str, VALUE str2)
00870 {
00871 long len;
00872
00873 len = RSTRING_LEN(str2);
00874 if (STR_ASSOC_P(str2)) {
00875 str2 = rb_str_new4(str2);
00876 }
00877 if (STR_SHARED_P(str2)) {
00878 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
00879 assert(OBJ_FROZEN(shared));
00880 STR_SET_NOEMBED(str);
00881 RSTRING(str)->as.heap.len = len;
00882 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
00883 FL_SET(str, ELTS_SHARED);
00884 FL_UNSET(str, STR_ASSOC);
00885 RSTRING(str)->as.heap.aux.shared = shared;
00886 }
00887 else {
00888 str_replace_shared(str, str2);
00889 }
00890
00891 OBJ_INFECT(str, str2);
00892 rb_enc_cr_str_exact_copy(str, str2);
00893 return str;
00894 }
00895
00896 static VALUE
00897 str_duplicate(VALUE klass, VALUE str)
00898 {
00899 VALUE dup = str_alloc(klass);
00900 str_replace(dup, str);
00901 return dup;
00902 }
00903
00904 VALUE
00905 rb_str_dup(VALUE str)
00906 {
00907 return str_duplicate(rb_obj_class(str), str);
00908 }
00909
00910 VALUE
00911 rb_str_resurrect(VALUE str)
00912 {
00913 return str_replace(str_alloc(rb_cString), str);
00914 }
00915
00916
00917
00918
00919
00920
00921
00922
00923 static VALUE
00924 rb_str_init(int argc, VALUE *argv, VALUE str)
00925 {
00926 VALUE orig;
00927
00928 if (argc > 0 && rb_scan_args(argc, argv, "01", &orig) == 1)
00929 rb_str_replace(str, orig);
00930 return str;
00931 }
00932
00933 static inline long
00934 enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
00935 {
00936 long c;
00937 const char *q;
00938
00939 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
00940 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00941 }
00942 else if (rb_enc_asciicompat(enc)) {
00943 c = 0;
00944 if (cr == ENC_CODERANGE_7BIT || cr == ENC_CODERANGE_VALID) {
00945 while (p < e) {
00946 if (ISASCII(*p)) {
00947 q = search_nonascii(p, e);
00948 if (!q)
00949 return c + (e - p);
00950 c += q - p;
00951 p = q;
00952 }
00953 p += rb_enc_fast_mbclen(p, e, enc);
00954 c++;
00955 }
00956 }
00957 else {
00958 while (p < e) {
00959 if (ISASCII(*p)) {
00960 q = search_nonascii(p, e);
00961 if (!q)
00962 return c + (e - p);
00963 c += q - p;
00964 p = q;
00965 }
00966 p += rb_enc_mbclen(p, e, enc);
00967 c++;
00968 }
00969 }
00970 return c;
00971 }
00972
00973 for (c=0; p<e; c++) {
00974 p += rb_enc_mbclen(p, e, enc);
00975 }
00976 return c;
00977 }
00978
00979 long
00980 rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
00981 {
00982 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
00983 }
00984
00985 long
00986 rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
00987 {
00988 long c;
00989 const char *q;
00990 int ret;
00991
00992 *cr = 0;
00993 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
00994 return (e - p + rb_enc_mbminlen(enc) - 1) / rb_enc_mbminlen(enc);
00995 }
00996 else if (rb_enc_asciicompat(enc)) {
00997 c = 0;
00998 while (p < e) {
00999 if (ISASCII(*p)) {
01000 q = search_nonascii(p, e);
01001 if (!q) {
01002 if (!*cr) *cr = ENC_CODERANGE_7BIT;
01003 return c + (e - p);
01004 }
01005 c += q - p;
01006 p = q;
01007 }
01008 ret = rb_enc_precise_mbclen(p, e, enc);
01009 if (MBCLEN_CHARFOUND_P(ret)) {
01010 *cr |= ENC_CODERANGE_VALID;
01011 p += MBCLEN_CHARFOUND_LEN(ret);
01012 }
01013 else {
01014 *cr = ENC_CODERANGE_BROKEN;
01015 p++;
01016 }
01017 c++;
01018 }
01019 if (!*cr) *cr = ENC_CODERANGE_7BIT;
01020 return c;
01021 }
01022
01023 for (c=0; p<e; c++) {
01024 ret = rb_enc_precise_mbclen(p, e, enc);
01025 if (MBCLEN_CHARFOUND_P(ret)) {
01026 *cr |= ENC_CODERANGE_VALID;
01027 p += MBCLEN_CHARFOUND_LEN(ret);
01028 }
01029 else {
01030 *cr = ENC_CODERANGE_BROKEN;
01031 if (p + rb_enc_mbminlen(enc) <= e)
01032 p += rb_enc_mbminlen(enc);
01033 else
01034 p = e;
01035 }
01036 }
01037 if (!*cr) *cr = ENC_CODERANGE_7BIT;
01038 return c;
01039 }
01040
#ifdef NONASCII_MASK
/* A byte is a UTF-8 lead byte unless its top two bits are 10. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * Count the bytes of the word at s that satisfy is_utf8_lead_byte().
 *
 * Step 1 computes, for every byte in parallel, (bit6 | ~bit7) and moves it
 * to the byte's lowest bit, giving a 0/1 flag per byte.
 * Step 2 adds the per-byte flags together with shifted additions; the total
 * ends up in the low nibble.
 */
static inline VALUE
count_utf8_lead_bytes_with_word(const VALUE *s)
{
    VALUE d = *s;

    /* step 1: one flag bit per byte */
    d = (d | ~(d >> 1)) >> 6;
    d &= NONASCII_MASK >> 7;

    /* step 2: horizontal sum of the flags */
    d += (d >> 8);
    d += (d >> 16);
#if SIZEOF_VALUE == 8
    d += (d >> 32);
#endif
    return (d & 0xF);
}
#endif
01075
01076 static long
01077 str_strlen(VALUE str, rb_encoding *enc)
01078 {
01079 const char *p, *e;
01080 long n;
01081 int cr;
01082
01083 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
01084 if (!enc) enc = STR_ENC_GET(str);
01085 p = RSTRING_PTR(str);
01086 e = RSTRING_END(str);
01087 cr = ENC_CODERANGE(str);
01088 #ifdef NONASCII_MASK
01089 if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01090 enc == rb_utf8_encoding()) {
01091
01092 VALUE len = 0;
01093 if ((int)sizeof(VALUE) * 2 < e - p) {
01094 const VALUE *s, *t;
01095 const VALUE lowbits = sizeof(VALUE) - 1;
01096 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01097 t = (const VALUE*)(~lowbits & (VALUE)e);
01098 while (p < (const char *)s) {
01099 if (is_utf8_lead_byte(*p)) len++;
01100 p++;
01101 }
01102 while (s < t) {
01103 len += count_utf8_lead_bytes_with_word(s);
01104 s++;
01105 }
01106 p = (const char *)s;
01107 }
01108 while (p < e) {
01109 if (is_utf8_lead_byte(*p)) len++;
01110 p++;
01111 }
01112 return (long)len;
01113 }
01114 #endif
01115 n = rb_enc_strlen_cr(p, e, enc, &cr);
01116 if (cr) {
01117 ENC_CODERANGE_SET(str, cr);
01118 }
01119 return n;
01120 }
01121
01122 long
01123 rb_str_strlen(VALUE str)
01124 {
01125 return str_strlen(str, STR_ENC_GET(str));
01126 }
01127
01128
01129
01130
01131
01132
01133
01134
01135
01136 VALUE
01137 rb_str_length(VALUE str)
01138 {
01139 long len;
01140
01141 len = str_strlen(str, STR_ENC_GET(str));
01142 return LONG2NUM(len);
01143 }
01144
01145
01146
01147
01148
01149
01150
01151
01152 static VALUE
01153 rb_str_bytesize(VALUE str)
01154 {
01155 return LONG2NUM(RSTRING_LEN(str));
01156 }
01157
01158
01159
01160
01161
01162
01163
01164
01165
01166
01167
01168 static VALUE
01169 rb_str_empty(VALUE str)
01170 {
01171 if (RSTRING_LEN(str) == 0)
01172 return Qtrue;
01173 return Qfalse;
01174 }
01175
01176
01177
01178
01179
01180
01181
01182
01183
01184
01185
01186 VALUE
01187 rb_str_plus(VALUE str1, VALUE str2)
01188 {
01189 VALUE str3;
01190 rb_encoding *enc;
01191
01192 StringValue(str2);
01193 enc = rb_enc_check(str1, str2);
01194 str3 = rb_str_new(0, RSTRING_LEN(str1)+RSTRING_LEN(str2));
01195 memcpy(RSTRING_PTR(str3), RSTRING_PTR(str1), RSTRING_LEN(str1));
01196 memcpy(RSTRING_PTR(str3) + RSTRING_LEN(str1),
01197 RSTRING_PTR(str2), RSTRING_LEN(str2));
01198 RSTRING_PTR(str3)[RSTRING_LEN(str3)] = '\0';
01199
01200 if (OBJ_TAINTED(str1) || OBJ_TAINTED(str2))
01201 OBJ_TAINT(str3);
01202 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
01203 ENC_CODERANGE_AND(ENC_CODERANGE(str1), ENC_CODERANGE(str2)));
01204 return str3;
01205 }
01206
01207
01208
01209
01210
01211
01212
01213
01214
01215
01216
01217 VALUE
01218 rb_str_times(VALUE str, VALUE times)
01219 {
01220 VALUE str2;
01221 long n, len;
01222 char *ptr2;
01223
01224 len = NUM2LONG(times);
01225 if (len < 0) {
01226 rb_raise(rb_eArgError, "negative argument");
01227 }
01228 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
01229 rb_raise(rb_eArgError, "argument too big");
01230 }
01231
01232 str2 = rb_str_new5(str, 0, len *= RSTRING_LEN(str));
01233 ptr2 = RSTRING_PTR(str2);
01234 if (len) {
01235 n = RSTRING_LEN(str);
01236 memcpy(ptr2, RSTRING_PTR(str), n);
01237 while (n <= len/2) {
01238 memcpy(ptr2 + n, ptr2, n);
01239 n *= 2;
01240 }
01241 memcpy(ptr2 + n, ptr2, len-n);
01242 }
01243 ptr2[RSTRING_LEN(str2)] = '\0';
01244 OBJ_INFECT(str2, str);
01245 rb_enc_cr_str_copy_for_substr(str2, str);
01246
01247 return str2;
01248 }
01249
01250
01251
01252
01253
01254
01255
01256
01257
01258
01259
01260
01261
01262
01263
01264
01265 static VALUE
01266 rb_str_format_m(VALUE str, VALUE arg)
01267 {
01268 volatile VALUE tmp = rb_check_array_type(arg);
01269
01270 if (!NIL_P(tmp)) {
01271 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_PTR(tmp), str);
01272 }
01273 return rb_str_format(1, &arg, str);
01274 }
01275
01276 static inline void
01277 str_modifiable(VALUE str)
01278 {
01279 if (FL_TEST(str, STR_TMPLOCK)) {
01280 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
01281 }
01282 rb_check_frozen(str);
01283 if (!OBJ_UNTRUSTED(str) && rb_safe_level() >= 4)
01284 rb_raise(rb_eSecurityError, "Insecure: can't modify string");
01285 }
01286
01287 static inline int
01288 str_independent(VALUE str)
01289 {
01290 str_modifiable(str);
01291 if (!STR_SHARED_P(str)) return 1;
01292 if (STR_EMBED_P(str)) return 1;
01293 return 0;
01294 }
01295
01296 static void
01297 str_make_independent_expand(VALUE str, long expand)
01298 {
01299 char *ptr;
01300 long len = RSTRING_LEN(str);
01301 long capa = len + expand;
01302
01303 if (len > capa) len = capa;
01304 ptr = ALLOC_N(char, capa + 1);
01305 if (RSTRING_PTR(str)) {
01306 memcpy(ptr, RSTRING_PTR(str), len);
01307 }
01308 STR_SET_NOEMBED(str);
01309 STR_UNSET_NOCAPA(str);
01310 ptr[len] = 0;
01311 RSTRING(str)->as.heap.ptr = ptr;
01312 RSTRING(str)->as.heap.len = len;
01313 RSTRING(str)->as.heap.aux.capa = capa;
01314 }
01315
01316 #define str_make_independent(str) str_make_independent_expand((str), 0L)
01317
01318 void
01319 rb_str_modify(VALUE str)
01320 {
01321 if (!str_independent(str))
01322 str_make_independent(str);
01323 ENC_CODERANGE_CLEAR(str);
01324 }
01325
01326 void
01327 rb_str_modify_expand(VALUE str, long expand)
01328 {
01329 if (expand < 0) {
01330 rb_raise(rb_eArgError, "negative expanding string size");
01331 }
01332 if (!str_independent(str)) {
01333 str_make_independent_expand(str, expand);
01334 }
01335 else if (expand > 0) {
01336 long len = RSTRING_LEN(str);
01337 long capa = len + expand;
01338 if (!STR_EMBED_P(str)) {
01339 REALLOC_N(RSTRING(str)->as.heap.ptr, char, capa+1);
01340 RSTRING(str)->as.heap.aux.capa = capa;
01341 }
01342 else if (capa > RSTRING_EMBED_LEN_MAX) {
01343 str_make_independent_expand(str, expand);
01344 }
01345 }
01346 ENC_CODERANGE_CLEAR(str);
01347 }
01348
01349
01350 static void
01351 str_modify_keep_cr(VALUE str)
01352 {
01353 if (!str_independent(str))
01354 str_make_independent(str);
01355 if (ENC_CODERANGE(str) == ENC_CODERANGE_BROKEN)
01356
01357 ENC_CODERANGE_CLEAR(str);
01358 }
01359
01360 static inline void
01361 str_discard(VALUE str)
01362 {
01363 str_modifiable(str);
01364 if (!STR_SHARED_P(str) && !STR_EMBED_P(str)) {
01365 xfree(RSTRING_PTR(str));
01366 RSTRING(str)->as.heap.ptr = 0;
01367 RSTRING(str)->as.heap.len = 0;
01368 }
01369 }
01370
01371 void
01372 rb_str_associate(VALUE str, VALUE add)
01373 {
01374
01375 rb_check_frozen(str);
01376 if (STR_ASSOC_P(str)) {
01377
01378 rb_ary_concat(RSTRING(str)->as.heap.aux.shared, add);
01379 }
01380 else {
01381 if (STR_SHARED_P(str)) {
01382 VALUE assoc = RSTRING(str)->as.heap.aux.shared;
01383 str_make_independent(str);
01384 if (STR_ASSOC_P(assoc)) {
01385 assoc = RSTRING(assoc)->as.heap.aux.shared;
01386 rb_ary_concat(assoc, add);
01387 add = assoc;
01388 }
01389 }
01390 else if (STR_EMBED_P(str)) {
01391 str_make_independent(str);
01392 }
01393 else if (RSTRING(str)->as.heap.aux.capa != RSTRING_LEN(str)) {
01394 RESIZE_CAPA(str, RSTRING_LEN(str));
01395 }
01396 FL_SET(str, STR_ASSOC);
01397 RBASIC(add)->klass = 0;
01398 RSTRING(str)->as.heap.aux.shared = add;
01399 }
01400 }
01401
01402 VALUE
01403 rb_str_associated(VALUE str)
01404 {
01405 if (STR_SHARED_P(str)) str = RSTRING(str)->as.heap.aux.shared;
01406 if (STR_ASSOC_P(str)) {
01407 return RSTRING(str)->as.heap.aux.shared;
01408 }
01409 return Qfalse;
01410 }
01411
01412 VALUE
01413 rb_string_value(volatile VALUE *ptr)
01414 {
01415 VALUE s = *ptr;
01416 if (TYPE(s) != T_STRING) {
01417 s = rb_str_to_str(s);
01418 *ptr = s;
01419 }
01420 return s;
01421 }
01422
01423 char *
01424 rb_string_value_ptr(volatile VALUE *ptr)
01425 {
01426 VALUE str = rb_string_value(ptr);
01427 return RSTRING_PTR(str);
01428 }
01429
01430 char *
01431 rb_string_value_cstr(volatile VALUE *ptr)
01432 {
01433 VALUE str = rb_string_value(ptr);
01434 char *s = RSTRING_PTR(str);
01435 long len = RSTRING_LEN(str);
01436
01437 if (!s || memchr(s, 0, len)) {
01438 rb_raise(rb_eArgError, "string contains null byte");
01439 }
01440 if (s[len]) {
01441 rb_str_modify(str);
01442 s = RSTRING_PTR(str);
01443 s[RSTRING_LEN(str)] = 0;
01444 }
01445 return s;
01446 }
01447
01448 VALUE
01449 rb_check_string_type(VALUE str)
01450 {
01451 str = rb_check_convert_type(str, T_STRING, "String", "to_str");
01452 return str;
01453 }
01454
01455
01456
01457
01458
01459
01460
01461
01462
01463
01464
01465
01466 static VALUE
01467 rb_str_s_try_convert(VALUE dummy, VALUE str)
01468 {
01469 return rb_check_string_type(str);
01470 }
01471
01472 static char*
01473 str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
01474 {
01475 long nth = *nthp;
01476 if (rb_enc_mbmaxlen(enc) == 1) {
01477 p += nth;
01478 }
01479 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01480 p += nth * rb_enc_mbmaxlen(enc);
01481 }
01482 else if (rb_enc_asciicompat(enc)) {
01483 const char *p2, *e2;
01484 int n;
01485
01486 while (p < e && 0 < nth) {
01487 e2 = p + nth;
01488 if (e < e2) {
01489 *nthp = nth;
01490 return (char *)e;
01491 }
01492 if (ISASCII(*p)) {
01493 p2 = search_nonascii(p, e2);
01494 if (!p2) {
01495 *nthp = nth;
01496 return (char *)e2;
01497 }
01498 nth -= p2 - p;
01499 p = p2;
01500 }
01501 n = rb_enc_mbclen(p, e, enc);
01502 p += n;
01503 nth--;
01504 }
01505 *nthp = nth;
01506 if (nth != 0) {
01507 return (char *)e;
01508 }
01509 return (char *)p;
01510 }
01511 else {
01512 while (p < e && nth--) {
01513 p += rb_enc_mbclen(p, e, enc);
01514 }
01515 }
01516 if (p > e) p = e;
01517 *nthp = nth;
01518 return (char*)p;
01519 }
01520
01521 char*
01522 rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
01523 {
01524 return str_nth_len(p, e, &nth, enc);
01525 }
01526
01527 static char*
01528 str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01529 {
01530 if (singlebyte)
01531 p += nth;
01532 else {
01533 p = str_nth_len(p, e, &nth, enc);
01534 }
01535 if (!p) return 0;
01536 if (p > e) p = e;
01537 return (char *)p;
01538 }
01539
01540
01541 static long
01542 str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
01543 {
01544 const char *pp = str_nth(p, e, nth, enc, singlebyte);
01545 if (!pp) return e - p;
01546 return pp - p;
01547 }
01548
01549 long
01550 rb_str_offset(VALUE str, long pos)
01551 {
01552 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
01553 STR_ENC_GET(str), single_byte_optimizable(str));
01554 }
01555
01556 #ifdef NONASCII_MASK
01557 static char *
01558 str_utf8_nth(const char *p, const char *e, long *nthp)
01559 {
01560 long nth = *nthp;
01561 if ((int)SIZEOF_VALUE * 2 < e - p && (int)SIZEOF_VALUE * 2 < nth) {
01562 const VALUE *s, *t;
01563 const VALUE lowbits = sizeof(VALUE) - 1;
01564 s = (const VALUE*)(~lowbits & ((VALUE)p + lowbits));
01565 t = (const VALUE*)(~lowbits & (VALUE)e);
01566 while (p < (const char *)s) {
01567 if (is_utf8_lead_byte(*p)) nth--;
01568 p++;
01569 }
01570 do {
01571 nth -= count_utf8_lead_bytes_with_word(s);
01572 s++;
01573 } while (s < t && (int)sizeof(VALUE) <= nth);
01574 p = (char *)s;
01575 }
01576 while (p < e) {
01577 if (is_utf8_lead_byte(*p)) {
01578 if (nth == 0) break;
01579 nth--;
01580 }
01581 p++;
01582 }
01583 *nthp = nth;
01584 return (char *)p;
01585 }
01586
01587 static long
01588 str_utf8_offset(const char *p, const char *e, long nth)
01589 {
01590 const char *pp = str_utf8_nth(p, e, &nth);
01591 return pp - p;
01592 }
01593 #endif
01594
01595
01596 long
01597 rb_str_sublen(VALUE str, long pos)
01598 {
01599 if (single_byte_optimizable(str) || pos < 0)
01600 return pos;
01601 else {
01602 char *p = RSTRING_PTR(str);
01603 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
01604 }
01605 }
01606
01607 VALUE
01608 rb_str_subseq(VALUE str, long beg, long len)
01609 {
01610 VALUE str2;
01611
01612 if (RSTRING_LEN(str) == beg + len &&
01613 RSTRING_EMBED_LEN_MAX < len) {
01614 str2 = rb_str_new_shared(rb_str_new_frozen(str));
01615 rb_str_drop_bytes(str2, beg);
01616 }
01617 else {
01618 str2 = rb_str_new5(str, RSTRING_PTR(str)+beg, len);
01619 }
01620
01621 rb_enc_cr_str_copy_for_substr(str2, str);
01622 OBJ_INFECT(str2, str);
01623
01624 return str2;
01625 }
01626
01627 VALUE
01628 rb_str_substr(VALUE str, long beg, long len)
01629 {
01630 rb_encoding *enc = STR_ENC_GET(str);
01631 VALUE str2;
01632 char *p, *s = RSTRING_PTR(str), *e = s + RSTRING_LEN(str);
01633
01634 if (len < 0) return Qnil;
01635 if (!RSTRING_LEN(str)) {
01636 len = 0;
01637 }
01638 if (single_byte_optimizable(str)) {
01639 if (beg > RSTRING_LEN(str)) return Qnil;
01640 if (beg < 0) {
01641 beg += RSTRING_LEN(str);
01642 if (beg < 0) return Qnil;
01643 }
01644 if (beg + len > RSTRING_LEN(str))
01645 len = RSTRING_LEN(str) - beg;
01646 if (len <= 0) {
01647 len = 0;
01648 p = 0;
01649 }
01650 else
01651 p = s + beg;
01652 goto sub;
01653 }
01654 if (beg < 0) {
01655 if (len > -beg) len = -beg;
01656 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
01657 beg = -beg;
01658 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
01659 p = e;
01660 if (!p) return Qnil;
01661 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
01662 if (!p) return Qnil;
01663 len = e - p;
01664 goto sub;
01665 }
01666 else {
01667 beg += str_strlen(str, enc);
01668 if (beg < 0) return Qnil;
01669 }
01670 }
01671 else if (beg > 0 && beg > RSTRING_LEN(str)) {
01672 return Qnil;
01673 }
01674 if (len == 0) {
01675 if (beg > str_strlen(str, enc)) return Qnil;
01676 p = 0;
01677 }
01678 #ifdef NONASCII_MASK
01679 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
01680 enc == rb_utf8_encoding()) {
01681 p = str_utf8_nth(s, e, &beg);
01682 if (beg > 0) return Qnil;
01683 len = str_utf8_offset(p, e, len);
01684 }
01685 #endif
01686 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
01687 int char_sz = rb_enc_mbmaxlen(enc);
01688
01689 p = s + beg * char_sz;
01690 if (p > e) {
01691 return Qnil;
01692 }
01693 else if (len * char_sz > e - p)
01694 len = e - p;
01695 else
01696 len *= char_sz;
01697 }
01698 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
01699 if (beg > 0) return Qnil;
01700 len = 0;
01701 }
01702 else {
01703 len = str_offset(p, e, len, enc, 0);
01704 }
01705 sub:
01706 if (len > RSTRING_EMBED_LEN_MAX && beg + len == RSTRING_LEN(str)) {
01707 str2 = rb_str_new4(str);
01708 str2 = str_new3(rb_obj_class(str2), str2);
01709 RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
01710 RSTRING(str2)->as.heap.len = len;
01711 }
01712 else {
01713 str2 = rb_str_new5(str, p, len);
01714 rb_enc_cr_str_copy_for_substr(str2, str);
01715 OBJ_INFECT(str2, str);
01716 }
01717
01718 return str2;
01719 }
01720
01721 VALUE
01722 rb_str_freeze(VALUE str)
01723 {
01724 if (STR_ASSOC_P(str)) {
01725 VALUE ary = RSTRING(str)->as.heap.aux.shared;
01726 OBJ_FREEZE(ary);
01727 }
01728 return rb_obj_freeze(str);
01729 }
01730
/*
 * rb_str_dup_frozen: declared through RUBY_ALIAS_FUNCTION with
 * rb_str_new_frozen as its target and (str) as the forwarded argument.
 * NOTE(review): presumably this emits an exported alias/wrapper symbol --
 * confirm against the RUBY_ALIAS_FUNCTION definition.
 * The #define makes later uses of the name in this file resolve directly
 * to rb_str_new_frozen.
 */
01731 RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
01732 #define rb_str_dup_frozen rb_str_new_frozen
01733
01734 VALUE
01735 rb_str_locktmp(VALUE str)
01736 {
01737 if (FL_TEST(str, STR_TMPLOCK)) {
01738 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
01739 }
01740 FL_SET(str, STR_TMPLOCK);
01741 return str;
01742 }
01743
01744 VALUE
01745 rb_str_unlocktmp(VALUE str)
01746 {
01747 if (!FL_TEST(str, STR_TMPLOCK)) {
01748 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
01749 }
01750 FL_UNSET(str, STR_TMPLOCK);
01751 return str;
01752 }
01753
01754 void
01755 rb_str_set_len(VALUE str, long len)
01756 {
01757 long capa;
01758
01759 str_modifiable(str);
01760 if (STR_SHARED_P(str)) {
01761 rb_raise(rb_eRuntimeError, "can't set length of shared string");
01762 }
01763 if (len > (capa = (long)rb_str_capacity(str))) {
01764 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
01765 }
01766 STR_SET_LEN(str, len);
01767 RSTRING_PTR(str)[len] = '\0';
01768 }
01769
01770 VALUE
01771 rb_str_resize(VALUE str, long len)
01772 {
01773 long slen;
01774 int independent;
01775
01776 if (len < 0) {
01777 rb_raise(rb_eArgError, "negative string size (or size too big)");
01778 }
01779
01780 independent = str_independent(str);
01781 ENC_CODERANGE_CLEAR(str);
01782 slen = RSTRING_LEN(str);
01783 if (len != slen) {
01784 if (STR_EMBED_P(str)) {
01785 if (len <= RSTRING_EMBED_LEN_MAX) {
01786 STR_SET_EMBED_LEN(str, len);
01787 RSTRING(str)->as.ary[len] = '\0';
01788 return str;
01789 }
01790 str_make_independent_expand(str, len - slen);
01791 STR_SET_NOEMBED(str);
01792 }
01793 else if (len <= RSTRING_EMBED_LEN_MAX) {
01794 char *ptr = RSTRING(str)->as.heap.ptr;
01795 STR_SET_EMBED(str);
01796 if (slen > len) slen = len;
01797 if (slen > 0) MEMCPY(RSTRING(str)->as.ary, ptr, char, slen);
01798 RSTRING(str)->as.ary[len] = '\0';
01799 STR_SET_EMBED_LEN(str, len);
01800 if (independent) xfree(ptr);
01801 return str;
01802 }
01803 else if (!independent) {
01804 str_make_independent_expand(str, len - slen);
01805 }
01806 else if (slen < len || slen - len > 1024) {
01807 REALLOC_N(RSTRING(str)->as.heap.ptr, char, len+1);
01808 }
01809 if (!STR_NOCAPA_P(str)) {
01810 RSTRING(str)->as.heap.aux.capa = len;
01811 }
01812 RSTRING(str)->as.heap.len = len;
01813 RSTRING(str)->as.heap.ptr[len] = '\0';
01814 }
01815 return str;
01816 }
01817
01818 static VALUE
01819 str_buf_cat(VALUE str, const char *ptr, long len)
01820 {
01821 long capa, total, off = -1;
01822
01823 if (ptr >= RSTRING_PTR(str) && ptr <= RSTRING_END(str)) {
01824 off = ptr - RSTRING_PTR(str);
01825 }
01826 rb_str_modify(str);
01827 if (len == 0) return 0;
01828 if (STR_ASSOC_P(str)) {
01829 FL_UNSET(str, STR_ASSOC);
01830 capa = RSTRING(str)->as.heap.aux.capa = RSTRING_LEN(str);
01831 }
01832 else if (STR_EMBED_P(str)) {
01833 capa = RSTRING_EMBED_LEN_MAX;
01834 }
01835 else {
01836 capa = RSTRING(str)->as.heap.aux.capa;
01837 }
01838 if (RSTRING_LEN(str) >= LONG_MAX - len) {
01839 rb_raise(rb_eArgError, "string sizes too big");
01840 }
01841 total = RSTRING_LEN(str)+len;
01842 if (capa <= total) {
01843 while (total > capa) {
01844 if (capa + 1 >= LONG_MAX / 2) {
01845 capa = (total + 4095) / 4096;
01846 break;
01847 }
01848 capa = (capa + 1) * 2;
01849 }
01850 RESIZE_CAPA(str, capa);
01851 }
01852 if (off != -1) {
01853 ptr = RSTRING_PTR(str) + off;
01854 }
01855 memcpy(RSTRING_PTR(str) + RSTRING_LEN(str), ptr, len);
01856 STR_SET_LEN(str, total);
01857 RSTRING_PTR(str)[total] = '\0';
01858
01859 return str;
01860 }
01861
01862 #define str_buf_cat2(str, ptr) str_buf_cat((str), (ptr), strlen(ptr))
01863
01864 VALUE
01865 rb_str_buf_cat(VALUE str, const char *ptr, long len)
01866 {
01867 if (len == 0) return str;
01868 if (len < 0) {
01869 rb_raise(rb_eArgError, "negative string size (or size too big)");
01870 }
01871 return str_buf_cat(str, ptr, len);
01872 }
01873
01874 VALUE
01875 rb_str_buf_cat2(VALUE str, const char *ptr)
01876 {
01877 return rb_str_buf_cat(str, ptr, strlen(ptr));
01878 }
01879
01880 VALUE
01881 rb_str_cat(VALUE str, const char *ptr, long len)
01882 {
01883 if (len < 0) {
01884 rb_raise(rb_eArgError, "negative string size (or size too big)");
01885 }
01886 if (STR_ASSOC_P(str)) {
01887 char *p;
01888 rb_str_modify_expand(str, len);
01889 p = RSTRING(str)->as.heap.ptr;
01890 memcpy(p + RSTRING(str)->as.heap.len, ptr, len);
01891 len = RSTRING(str)->as.heap.len += len;
01892 p[len] = '\0';
01893 return str;
01894 }
01895
01896 return rb_str_buf_cat(str, ptr, len);
01897 }
01898
01899 VALUE
01900 rb_str_cat2(VALUE str, const char *ptr)
01901 {
01902 return rb_str_cat(str, ptr, strlen(ptr));
01903 }
01904
01905 static VALUE
01906 rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
01907 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
01908 {
01909 int str_encindex = ENCODING_GET(str);
01910 int res_encindex;
01911 int str_cr, res_cr;
01912
01913 str_cr = ENC_CODERANGE(str);
01914
01915 if (str_encindex == ptr_encindex) {
01916 if (str_cr == ENC_CODERANGE_UNKNOWN)
01917 ptr_cr = ENC_CODERANGE_UNKNOWN;
01918 else if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
01919 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
01920 }
01921 }
01922 else {
01923 rb_encoding *str_enc = rb_enc_from_index(str_encindex);
01924 rb_encoding *ptr_enc = rb_enc_from_index(ptr_encindex);
01925 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
01926 if (len == 0)
01927 return str;
01928 if (RSTRING_LEN(str) == 0) {
01929 rb_str_buf_cat(str, ptr, len);
01930 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
01931 return str;
01932 }
01933 goto incompatible;
01934 }
01935 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
01936 ptr_cr = coderange_scan(ptr, len, ptr_enc);
01937 }
01938 if (str_cr == ENC_CODERANGE_UNKNOWN) {
01939 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
01940 str_cr = rb_enc_str_coderange(str);
01941 }
01942 }
01943 }
01944 if (ptr_cr_ret)
01945 *ptr_cr_ret = ptr_cr;
01946
01947 if (str_encindex != ptr_encindex &&
01948 str_cr != ENC_CODERANGE_7BIT &&
01949 ptr_cr != ENC_CODERANGE_7BIT) {
01950 incompatible:
01951 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
01952 rb_enc_name(rb_enc_from_index(str_encindex)),
01953 rb_enc_name(rb_enc_from_index(ptr_encindex)));
01954 }
01955
01956 if (str_cr == ENC_CODERANGE_UNKNOWN) {
01957 res_encindex = str_encindex;
01958 res_cr = ENC_CODERANGE_UNKNOWN;
01959 }
01960 else if (str_cr == ENC_CODERANGE_7BIT) {
01961 if (ptr_cr == ENC_CODERANGE_7BIT) {
01962 res_encindex = str_encindex;
01963 res_cr = ENC_CODERANGE_7BIT;
01964 }
01965 else {
01966 res_encindex = ptr_encindex;
01967 res_cr = ptr_cr;
01968 }
01969 }
01970 else if (str_cr == ENC_CODERANGE_VALID) {
01971 res_encindex = str_encindex;
01972 if (ptr_cr == ENC_CODERANGE_7BIT || ptr_cr == ENC_CODERANGE_VALID)
01973 res_cr = str_cr;
01974 else
01975 res_cr = ptr_cr;
01976 }
01977 else {
01978 res_encindex = str_encindex;
01979 res_cr = str_cr;
01980 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
01981 }
01982
01983 if (len < 0) {
01984 rb_raise(rb_eArgError, "negative string size (or size too big)");
01985 }
01986 str_buf_cat(str, ptr, len);
01987 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
01988 return str;
01989 }
01990
01991 VALUE
01992 rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
01993 {
01994 return rb_enc_cr_str_buf_cat(str, ptr, len,
01995 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
01996 }
01997
01998 VALUE
01999 rb_str_buf_cat_ascii(VALUE str, const char *ptr)
02000 {
02001
02002 int encindex = ENCODING_GET(str);
02003 rb_encoding *enc = rb_enc_from_index(encindex);
02004 if (rb_enc_asciicompat(enc)) {
02005 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
02006 encindex, ENC_CODERANGE_7BIT, 0);
02007 }
02008 else {
02009 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
02010 while (*ptr) {
02011 unsigned int c = (unsigned char)*ptr;
02012 int len = rb_enc_codelen(c, enc);
02013 rb_enc_mbcput(c, buf, enc);
02014 rb_enc_cr_str_buf_cat(str, buf, len,
02015 encindex, ENC_CODERANGE_VALID, 0);
02016 ptr++;
02017 }
02018 return str;
02019 }
02020 }
02021
02022 VALUE
02023 rb_str_buf_append(VALUE str, VALUE str2)
02024 {
02025 int str2_cr;
02026
02027 str2_cr = ENC_CODERANGE(str2);
02028
02029 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
02030 ENCODING_GET(str2), str2_cr, &str2_cr);
02031
02032 OBJ_INFECT(str, str2);
02033 ENC_CODERANGE_SET(str2, str2_cr);
02034
02035 return str;
02036 }
02037
02038 VALUE
02039 rb_str_append(VALUE str, VALUE str2)
02040 {
02041 rb_encoding *enc;
02042 int cr, cr2;
02043 long len2;
02044
02045 StringValue(str2);
02046 if ((len2 = RSTRING_LEN(str2)) > 0 && STR_ASSOC_P(str)) {
02047 long len = RSTRING_LEN(str) + len2;
02048 enc = rb_enc_check(str, str2);
02049 cr = ENC_CODERANGE(str);
02050 if ((cr2 = ENC_CODERANGE(str2)) > cr) cr = cr2;
02051 rb_str_modify_expand(str, len2);
02052 memcpy(RSTRING(str)->as.heap.ptr + RSTRING(str)->as.heap.len,
02053 RSTRING_PTR(str2), len2+1);
02054 RSTRING(str)->as.heap.len = len;
02055 rb_enc_associate(str, enc);
02056 ENC_CODERANGE_SET(str, cr);
02057 OBJ_INFECT(str, str2);
02058 return str;
02059 }
02060 return rb_str_buf_append(str, str2);
02061 }
02062
02063
02064
02065
02066
02067
02068
02069
02070
02071
02072
02073
02074
02075
02076
02077
02078
02079 VALUE
02080 rb_str_concat(VALUE str1, VALUE str2)
02081 {
02082 unsigned int code;
02083 rb_encoding *enc = STR_ENC_GET(str1);
02084
02085 if (FIXNUM_P(str2) || TYPE(str2) == T_BIGNUM) {
02086 if (rb_num_to_uint(str2, &code) == 0) {
02087 }
02088 else if (FIXNUM_P(str2)) {
02089 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
02090 }
02091 else {
02092 rb_raise(rb_eRangeError, "bignum out of char range");
02093 }
02094 }
02095 else {
02096 return rb_str_append(str1, str2);
02097 }
02098
02099 if (enc == rb_usascii_encoding()) {
02100
02101 char buf[1] = {(char)code};
02102 if (code > 0xFF) {
02103 rb_raise(rb_eRangeError, "%u out of char range", code);
02104 }
02105 rb_str_cat(str1, buf, 1);
02106 if (code > 127) {
02107 rb_enc_associate(str1, rb_ascii8bit_encoding());
02108 ENC_CODERANGE_SET(str1, ENC_CODERANGE_VALID);
02109 }
02110 }
02111 else {
02112 long pos = RSTRING_LEN(str1);
02113 int cr = ENC_CODERANGE(str1);
02114 int len;
02115 char *buf;
02116
02117 switch (len = rb_enc_codelen(code, enc)) {
02118 case ONIGERR_INVALID_CODE_POINT_VALUE:
02119 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
02120 break;
02121 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
02122 case 0:
02123 rb_raise(rb_eRangeError, "%u out of char range", code);
02124 break;
02125 }
02126 buf = ALLOCA_N(char, len + 1);
02127 rb_enc_mbcput(code, buf, enc);
02128 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
02129 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
02130 }
02131 rb_str_resize(str1, pos+len);
02132 strncpy(RSTRING_PTR(str1) + pos, buf, len);
02133 if (cr == ENC_CODERANGE_7BIT && code > 127)
02134 cr = ENC_CODERANGE_VALID;
02135 ENC_CODERANGE_SET(str1, cr);
02136 }
02137 return str1;
02138 }
02139
02140
02141
02142
02143
02144
02145
02146
02147
02148
02149
02150
02151 static VALUE
02152 rb_str_prepend(VALUE str, VALUE str2)
02153 {
02154 StringValue(str2);
02155 StringValue(str);
02156 rb_str_update(str, 0L, 0L, str2);
02157 return str;
02158 }
02159
02160 st_index_t
02161 rb_str_hash(VALUE str)
02162 {
02163 int e = ENCODING_GET(str);
02164 if (e && rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
02165 e = 0;
02166 }
02167 return rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)) ^ e;
02168 }
02169
02170 int
02171 rb_str_hash_cmp(VALUE str1, VALUE str2)
02172 {
02173 long len;
02174
02175 if (!rb_str_comparable(str1, str2)) return 1;
02176 if (RSTRING_LEN(str1) == (len = RSTRING_LEN(str2)) &&
02177 memcmp(RSTRING_PTR(str1), RSTRING_PTR(str2), len) == 0) {
02178 return 0;
02179 }
02180 return 1;
02181 }
02182
02183
02184
02185
02186
02187
02188
02189
02190 static VALUE
02191 rb_str_hash_m(VALUE str)
02192 {
02193 st_index_t hval = rb_str_hash(str);
02194 return INT2FIX(hval);
02195 }
02196
/* Smaller of a and b (a when equal); each argument may be evaluated twice. */
#define lesser(a,b) (((b)<(a))?(b):(a))
02198
02199 int
02200 rb_str_comparable(VALUE str1, VALUE str2)
02201 {
02202 int idx1, idx2;
02203 int rc1, rc2;
02204
02205 if (RSTRING_LEN(str1) == 0) return TRUE;
02206 if (RSTRING_LEN(str2) == 0) return TRUE;
02207 idx1 = ENCODING_GET(str1);
02208 idx2 = ENCODING_GET(str2);
02209 if (idx1 == idx2) return TRUE;
02210 rc1 = rb_enc_str_coderange(str1);
02211 rc2 = rb_enc_str_coderange(str2);
02212 if (rc1 == ENC_CODERANGE_7BIT) {
02213 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
02214 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
02215 return TRUE;
02216 }
02217 if (rc2 == ENC_CODERANGE_7BIT) {
02218 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
02219 return TRUE;
02220 }
02221 return FALSE;
02222 }
02223
02224 int
02225 rb_str_cmp(VALUE str1, VALUE str2)
02226 {
02227 long len1, len2;
02228 const char *ptr1, *ptr2;
02229 int retval;
02230
02231 if (str1 == str2) return 0;
02232 RSTRING_GETMEM(str1, ptr1, len1);
02233 RSTRING_GETMEM(str2, ptr2, len2);
02234 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
02235 if (len1 == len2) {
02236 if (!rb_str_comparable(str1, str2)) {
02237 if (ENCODING_GET(str1) > ENCODING_GET(str2))
02238 return 1;
02239 return -1;
02240 }
02241 return 0;
02242 }
02243 if (len1 > len2) return 1;
02244 return -1;
02245 }
02246 if (retval > 0) return 1;
02247 return -1;
02248 }
02249
02250
02251 static VALUE
02252 str_eql(const VALUE str1, const VALUE str2)
02253 {
02254 const long len = RSTRING_LEN(str1);
02255 const char *ptr1, *ptr2;
02256
02257 if (len != RSTRING_LEN(str2)) return Qfalse;
02258 if (!rb_str_comparable(str1, str2)) return Qfalse;
02259 if ((ptr1 = RSTRING_PTR(str1)) == (ptr2 = RSTRING_PTR(str2)))
02260 return Qtrue;
02261 if (memcmp(ptr1, ptr2, len) == 0)
02262 return Qtrue;
02263 return Qfalse;
02264 }
02265
02266
02267
02268
02269
02270
02271
02272
02273
02274 VALUE
02275 rb_str_equal(VALUE str1, VALUE str2)
02276 {
02277 if (str1 == str2) return Qtrue;
02278 if (TYPE(str2) != T_STRING) {
02279 if (!rb_respond_to(str2, rb_intern("to_str"))) {
02280 return Qfalse;
02281 }
02282 return rb_equal(str2, str1);
02283 }
02284 return str_eql(str1, str2);
02285 }
02286
02287
02288
02289
02290
02291
02292
02293
02294 static VALUE
02295 rb_str_eql(VALUE str1, VALUE str2)
02296 {
02297 if (str1 == str2) return Qtrue;
02298 if (TYPE(str2) != T_STRING) return Qfalse;
02299 return str_eql(str1, str2);
02300 }
02301
02302
02303
02304
02305
02306
02307
02308
02309
02310
02311
02312
02313
02314
02315
02316
02317
02318
02319
02320
02321
02322
02323
02324
02325 static VALUE
02326 rb_str_cmp_m(VALUE str1, VALUE str2)
02327 {
02328 long result;
02329
02330 if (TYPE(str2) != T_STRING) {
02331 if (!rb_respond_to(str2, rb_intern("to_str"))) {
02332 return Qnil;
02333 }
02334 else if (!rb_respond_to(str2, rb_intern("<=>"))) {
02335 return Qnil;
02336 }
02337 else {
02338 VALUE tmp = rb_funcall(str2, rb_intern("<=>"), 1, str1);
02339
02340 if (NIL_P(tmp)) return Qnil;
02341 if (!FIXNUM_P(tmp)) {
02342 return rb_funcall(LONG2FIX(0), '-', 1, tmp);
02343 }
02344 result = -FIX2LONG(tmp);
02345 }
02346 }
02347 else {
02348 result = rb_str_cmp(str1, str2);
02349 }
02350 return LONG2NUM(result);
02351 }
02352
02353
02354
02355
02356
02357
02358
02359
02360
02361
02362
02363
02364
02365 static VALUE
02366 rb_str_casecmp(VALUE str1, VALUE str2)
02367 {
02368 long len;
02369 rb_encoding *enc;
02370 char *p1, *p1end, *p2, *p2end;
02371
02372 StringValue(str2);
02373 enc = rb_enc_compatible(str1, str2);
02374 if (!enc) {
02375 return Qnil;
02376 }
02377
02378 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
02379 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
02380 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
02381 while (p1 < p1end && p2 < p2end) {
02382 if (*p1 != *p2) {
02383 unsigned int c1 = TOUPPER(*p1 & 0xff);
02384 unsigned int c2 = TOUPPER(*p2 & 0xff);
02385 if (c1 != c2)
02386 return INT2FIX(c1 < c2 ? -1 : 1);
02387 }
02388 p1++;
02389 p2++;
02390 }
02391 }
02392 else {
02393 while (p1 < p1end && p2 < p2end) {
02394 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
02395 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
02396
02397 if (0 <= c1 && 0 <= c2) {
02398 c1 = TOUPPER(c1);
02399 c2 = TOUPPER(c2);
02400 if (c1 != c2)
02401 return INT2FIX(c1 < c2 ? -1 : 1);
02402 }
02403 else {
02404 int r;
02405 l1 = rb_enc_mbclen(p1, p1end, enc);
02406 l2 = rb_enc_mbclen(p2, p2end, enc);
02407 len = l1 < l2 ? l1 : l2;
02408 r = memcmp(p1, p2, len);
02409 if (r != 0)
02410 return INT2FIX(r < 0 ? -1 : 1);
02411 if (l1 != l2)
02412 return INT2FIX(l1 < l2 ? -1 : 1);
02413 }
02414 p1 += l1;
02415 p2 += l2;
02416 }
02417 }
02418 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
02419 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
02420 return INT2FIX(-1);
02421 }
02422
02423 static long
02424 rb_str_index(VALUE str, VALUE sub, long offset)
02425 {
02426 long pos;
02427 char *s, *sptr, *e;
02428 long len, slen;
02429 rb_encoding *enc;
02430
02431 enc = rb_enc_check(str, sub);
02432 if (is_broken_string(sub)) {
02433 return -1;
02434 }
02435 len = str_strlen(str, enc);
02436 slen = str_strlen(sub, enc);
02437 if (offset < 0) {
02438 offset += len;
02439 if (offset < 0) return -1;
02440 }
02441 if (len - offset < slen) return -1;
02442 s = RSTRING_PTR(str);
02443 e = s + RSTRING_LEN(str);
02444 if (offset) {
02445 offset = str_offset(s, RSTRING_END(str), offset, enc, single_byte_optimizable(str));
02446 s += offset;
02447 }
02448 if (slen == 0) return offset;
02449
02450 sptr = RSTRING_PTR(sub);
02451 slen = RSTRING_LEN(sub);
02452 len = RSTRING_LEN(str) - offset;
02453 for (;;) {
02454 char *t;
02455 pos = rb_memsearch(sptr, slen, s, len, enc);
02456 if (pos < 0) return pos;
02457 t = rb_enc_right_char_head(s, s+pos, e, enc);
02458 if (t == s + pos) break;
02459 if ((len -= t - s) <= 0) return -1;
02460 offset += t - s;
02461 s = t;
02462 }
02463 return pos + offset;
02464 }
02465
02466
02467
02468
02469
02470
02471
02472
02473
02474
02475
02476
02477
02478
02479
02480
02481
02482
02483
02484 static VALUE
02485 rb_str_index_m(int argc, VALUE *argv, VALUE str)
02486 {
02487 VALUE sub;
02488 VALUE initpos;
02489 long pos;
02490
02491 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
02492 pos = NUM2LONG(initpos);
02493 }
02494 else {
02495 pos = 0;
02496 }
02497 if (pos < 0) {
02498 pos += str_strlen(str, STR_ENC_GET(str));
02499 if (pos < 0) {
02500 if (TYPE(sub) == T_REGEXP) {
02501 rb_backref_set(Qnil);
02502 }
02503 return Qnil;
02504 }
02505 }
02506
02507 switch (TYPE(sub)) {
02508 case T_REGEXP:
02509 if (pos > str_strlen(str, STR_ENC_GET(str)))
02510 return Qnil;
02511 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02512 rb_enc_check(str, sub), single_byte_optimizable(str));
02513
02514 pos = rb_reg_search(sub, str, pos, 0);
02515 pos = rb_str_sublen(str, pos);
02516 break;
02517
02518 default: {
02519 VALUE tmp;
02520
02521 tmp = rb_check_string_type(sub);
02522 if (NIL_P(tmp)) {
02523 rb_raise(rb_eTypeError, "type mismatch: %s given",
02524 rb_obj_classname(sub));
02525 }
02526 sub = tmp;
02527 }
02528
02529 case T_STRING:
02530 pos = rb_str_index(str, sub, pos);
02531 pos = rb_str_sublen(str, pos);
02532 break;
02533 }
02534
02535 if (pos == -1) return Qnil;
02536 return LONG2NUM(pos);
02537 }
02538
02539 static long
02540 rb_str_rindex(VALUE str, VALUE sub, long pos)
02541 {
02542 long len, slen;
02543 char *s, *sbeg, *e, *t;
02544 rb_encoding *enc;
02545 int singlebyte = single_byte_optimizable(str);
02546
02547 enc = rb_enc_check(str, sub);
02548 if (is_broken_string(sub)) {
02549 return -1;
02550 }
02551 len = str_strlen(str, enc);
02552 slen = str_strlen(sub, enc);
02553
02554 if (len < slen) return -1;
02555 if (len - pos < slen) {
02556 pos = len - slen;
02557 }
02558 if (len == 0) {
02559 return pos;
02560 }
02561 sbeg = RSTRING_PTR(str);
02562 e = RSTRING_END(str);
02563 t = RSTRING_PTR(sub);
02564 slen = RSTRING_LEN(sub);
02565 s = str_nth(sbeg, e, pos, enc, singlebyte);
02566 while (s) {
02567 if (memcmp(s, t, slen) == 0) {
02568 return pos;
02569 }
02570 if (pos == 0) break;
02571 pos--;
02572 s = rb_enc_prev_char(sbeg, s, e, enc);
02573 }
02574 return -1;
02575 }
02576
02577
02578
02579
02580
02581
02582
02583
02584
02585
02586
02587
02588
02589
02590
02591
02592
02593
02594
02595
02596 static VALUE
02597 rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
02598 {
02599 VALUE sub;
02600 VALUE vpos;
02601 rb_encoding *enc = STR_ENC_GET(str);
02602 long pos, len = str_strlen(str, enc);
02603
02604 if (rb_scan_args(argc, argv, "11", &sub, &vpos) == 2) {
02605 pos = NUM2LONG(vpos);
02606 if (pos < 0) {
02607 pos += len;
02608 if (pos < 0) {
02609 if (TYPE(sub) == T_REGEXP) {
02610 rb_backref_set(Qnil);
02611 }
02612 return Qnil;
02613 }
02614 }
02615 if (pos > len) pos = len;
02616 }
02617 else {
02618 pos = len;
02619 }
02620
02621 switch (TYPE(sub)) {
02622 case T_REGEXP:
02623
02624 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
02625 STR_ENC_GET(str), single_byte_optimizable(str));
02626
02627 if (!RREGEXP(sub)->ptr || RREGEXP_SRC_LEN(sub)) {
02628 pos = rb_reg_search(sub, str, pos, 1);
02629 pos = rb_str_sublen(str, pos);
02630 }
02631 if (pos >= 0) return LONG2NUM(pos);
02632 break;
02633
02634 default: {
02635 VALUE tmp;
02636
02637 tmp = rb_check_string_type(sub);
02638 if (NIL_P(tmp)) {
02639 rb_raise(rb_eTypeError, "type mismatch: %s given",
02640 rb_obj_classname(sub));
02641 }
02642 sub = tmp;
02643 }
02644
02645 case T_STRING:
02646 pos = rb_str_rindex(str, sub, pos);
02647 if (pos >= 0) return LONG2NUM(pos);
02648 break;
02649 }
02650 return Qnil;
02651 }
02652
02653
02654
02655
02656
02657
02658
02659
02660
02661
02662
02663
02664
02665
02666
02667 static VALUE
02668 rb_str_match(VALUE x, VALUE y)
02669 {
02670 switch (TYPE(y)) {
02671 case T_STRING:
02672 rb_raise(rb_eTypeError, "type mismatch: String given");
02673
02674 case T_REGEXP:
02675 return rb_reg_match(y, x);
02676
02677 default:
02678 return rb_funcall(y, rb_intern("=~"), 1, x);
02679 }
02680 }
02681
02682
02683 static VALUE get_pat(VALUE, int);
02684
02685
02686
02687
02688
02689
02690
02691
02692
02693
02694
02695
02696
02697
02698
02699
02700
02701
02702
02703
02704
02705
02706
02707
02708
02709
02710
02711
02712
02713
02714
02715 static VALUE
02716 rb_str_match_m(int argc, VALUE *argv, VALUE str)
02717 {
02718 VALUE re, result;
02719 if (argc < 1)
02720 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
02721 re = argv[0];
02722 argv[0] = str;
02723 result = rb_funcall2(get_pat(re, 0), rb_intern("match"), argc, argv);
02724 if (!NIL_P(result) && rb_block_given_p()) {
02725 return rb_yield(result);
02726 }
02727 return result;
02728 }
02729
/* Result codes returned by the enc_succ_char / enc_pred_char /
 * enc_succ_alnum_char helpers that step a single character. */
02730 enum neighbor_char {
/* enc_succ_alnum_char: the character is neither a digit nor alphabetic,
 * or no stepping range could be established for it. */
02731 NEIGHBOR_NOT_CHAR,
/* the buffer now holds a neighboring character of the same byte length */
02732 NEIGHBOR_FOUND,
/* stepping ran out of values for this length; the caller has to carry */
02733 NEIGHBOR_WRAPPED
02734 };
02735
02736 static enum neighbor_char
02737 enc_succ_char(char *p, long len, rb_encoding *enc)
02738 {
02739 long i;
02740 int l;
02741 while (1) {
02742 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
02743 p[i] = '\0';
02744 if (i < 0)
02745 return NEIGHBOR_WRAPPED;
02746 ++((unsigned char*)p)[i];
02747 l = rb_enc_precise_mbclen(p, p+len, enc);
02748 if (MBCLEN_CHARFOUND_P(l)) {
02749 l = MBCLEN_CHARFOUND_LEN(l);
02750 if (l == len) {
02751 return NEIGHBOR_FOUND;
02752 }
02753 else {
02754 memset(p+l, 0xff, len-l);
02755 }
02756 }
02757 if (MBCLEN_INVALID_P(l) && i < len-1) {
02758 long len2;
02759 int l2;
02760 for (len2 = len-1; 0 < len2; len2--) {
02761 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02762 if (!MBCLEN_INVALID_P(l2))
02763 break;
02764 }
02765 memset(p+len2+1, 0xff, len-(len2+1));
02766 }
02767 }
02768 }
02769
02770 static enum neighbor_char
02771 enc_pred_char(char *p, long len, rb_encoding *enc)
02772 {
02773 long i;
02774 int l;
02775 while (1) {
02776 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
02777 p[i] = '\xff';
02778 if (i < 0)
02779 return NEIGHBOR_WRAPPED;
02780 --((unsigned char*)p)[i];
02781 l = rb_enc_precise_mbclen(p, p+len, enc);
02782 if (MBCLEN_CHARFOUND_P(l)) {
02783 l = MBCLEN_CHARFOUND_LEN(l);
02784 if (l == len) {
02785 return NEIGHBOR_FOUND;
02786 }
02787 else {
02788 memset(p+l, 0, len-l);
02789 }
02790 }
02791 if (MBCLEN_INVALID_P(l) && i < len-1) {
02792 long len2;
02793 int l2;
02794 for (len2 = len-1; 0 < len2; len2--) {
02795 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
02796 if (!MBCLEN_INVALID_P(l2))
02797 break;
02798 }
02799 memset(p+len2+1, 0, len-(len2+1));
02800 }
02801 }
02802 }
02803
02804
02805
02806
02807
02808
02809
02810
02811
02812
02813 static enum neighbor_char
02814 enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
02815 {
02816 enum neighbor_char ret;
02817 unsigned int c;
02818 int ctype;
02819 int range;
02820 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
02821
02822 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02823 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
02824 ctype = ONIGENC_CTYPE_DIGIT;
02825 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
02826 ctype = ONIGENC_CTYPE_ALPHA;
02827 else
02828 return NEIGHBOR_NOT_CHAR;
02829
02830 MEMCPY(save, p, char, len);
02831 ret = enc_succ_char(p, len, enc);
02832 if (ret == NEIGHBOR_FOUND) {
02833 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02834 if (rb_enc_isctype(c, ctype, enc))
02835 return NEIGHBOR_FOUND;
02836 }
02837 MEMCPY(p, save, char, len);
02838 range = 1;
02839 while (1) {
02840 MEMCPY(save, p, char, len);
02841 ret = enc_pred_char(p, len, enc);
02842 if (ret == NEIGHBOR_FOUND) {
02843 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
02844 if (!rb_enc_isctype(c, ctype, enc)) {
02845 MEMCPY(p, save, char, len);
02846 break;
02847 }
02848 }
02849 else {
02850 MEMCPY(p, save, char, len);
02851 break;
02852 }
02853 range++;
02854 }
02855 if (range == 1) {
02856 return NEIGHBOR_NOT_CHAR;
02857 }
02858
02859 if (ctype != ONIGENC_CTYPE_DIGIT) {
02860 MEMCPY(carry, p, char, len);
02861 return NEIGHBOR_WRAPPED;
02862 }
02863
02864 MEMCPY(carry, p, char, len);
02865 enc_succ_char(carry, len, enc);
02866 return NEIGHBOR_WRAPPED;
02867 }
02868
02869
02870
02871
02872
02873
02874
02875
02876
02877
02878
02879
02880
02881
02882
02883
02884
02885
02886
02887
02888
02889
02890
02891
02892
02893
02894
/*
 * Returns a new string holding the successor of orig; orig itself is not
 * modified.  The copy is made with rb_str_new5() and receives orig's
 * encoding/code range and taint via rb_enc_cr_str_copy_for_substr() and
 * OBJ_INFECT().  An empty string yields an empty copy.
 *
 * Characters are visited from the end towards the beginning.  The first
 * pass steps digit/alphabetic characters with enc_succ_alnum_char(); if
 * that pass never handled one, a second pass steps every character with
 * enc_succ_char().  If stepping wrapped all the way, the carry bytes are
 * inserted at the position of the last character that wrapped.
 */
02895 VALUE
02896 rb_str_succ(VALUE orig)
02897 {
02898 rb_encoding *enc;
02899 VALUE str;
02900 char *sbeg, *s, *e, *last_alnum = 0;
/* c stays -1 until the first pass has wrapped at least one character */
02901 int c = -1;
02902 long l;
/* default carry is the single byte 0x01; overwritten by the helpers */
02903 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
02904 long carry_pos = 0, carry_len = 1;
02905 enum neighbor_char neighbor = NEIGHBOR_FOUND;
02906
02907 str = rb_str_new5(orig, RSTRING_PTR(orig), RSTRING_LEN(orig));
02908 rb_enc_cr_str_copy_for_substr(str, orig);
02909 OBJ_INFECT(str, orig);
02910 if (RSTRING_LEN(str) == 0) return str;
02911
02912 enc = STR_ENC_GET(orig);
02913 sbeg = RSTRING_PTR(str);
02914 s = e = sbeg + RSTRING_LEN(str);
02915
/* first pass: digit/alphabetic characters only, right to left */
02916 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
/* The previous character was skipped as NOT_CHAR and a character has
 * already wrapped: stop carrying if the byte at s is a digit while the
 * wrapped one was alphabetic (or the other way round), and insert the
 * carry at the wrapped character instead. */
02917 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
02918 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
02919 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
02920 s = last_alnum;
02921 break;
02922 }
02923 }
/* skip positions where no complete character is found */
02924 if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
02925 neighbor = enc_succ_alnum_char(s, l, enc, carry);
02926 switch (neighbor) {
02927 case NEIGHBOR_NOT_CHAR:
02928 continue;
02929 case NEIGHBOR_FOUND:
/* stepped without wrapping: done */
02930 return str;
02931 case NEIGHBOR_WRAPPED:
02932 last_alnum = s;
02933 break;
02934 }
/* remember where a carry would have to be inserted */
02935 c = 1;
02936 carry_pos = s - sbeg;
02937 carry_len = l;
02938 }
/* second pass: no character wrapped above, so step every character */
02939 if (c == -1) {
02940 s = e;
02941 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
02942 enum neighbor_char neighbor;
02943 if ((l = rb_enc_precise_mbclen(s, e, enc)) <= 0) continue;
02944 neighbor = enc_succ_char(s, l, enc);
02945 if (neighbor == NEIGHBOR_FOUND)
02946 return str;
/* the wrapped bytes do not form a character of length l: step again */
02947 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
02948
02949 enc_succ_char(s, l, enc);
02950 }
/* for encodings that are not ASCII compatible the carry is the wrapped
 * character itself rather than the default byte */
02951 if (!rb_enc_asciicompat(enc)) {
02952 MEMCPY(carry, s, char, l);
02953 carry_len = l;
02954 }
02955 carry_pos = s - sbeg;
02956 }
02957 }
/* everything wrapped: grow the string and insert the carry at carry_pos */
02958 RESIZE_CAPA(str, RSTRING_LEN(str) + carry_len);
02959 s = RSTRING_PTR(str) + carry_pos;
02960 memmove(s + carry_len, s, RSTRING_LEN(str) - carry_pos);
02961 memmove(s, carry, carry_len);
02962 STR_SET_LEN(str, RSTRING_LEN(str) + carry_len);
02963 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
02964 rb_enc_str_coderange(str);
02965 return str;
02966 }
02967
02968
02969
02970
02971
02972
02973
02974
02975
02976
02977
02978 static VALUE
02979 rb_str_succ_bang(VALUE str)
02980 {
02981 rb_str_shared_replace(str, rb_str_succ(str));
02982
02983 return str;
02984 }
02985
02986
02987
02988
02989
02990
02991
02992
02993
02994
02995
02996
02997
02998
02999
03000
03001
03002
03003
03004
03005
03006
03007
03008
03009
03010
03011
03012
03013
03014
03015
03016
03017
03018
/*
 * Iterates from beg up to end (argv[0]), yielding each value; the optional
 * second argument, when true, excludes end itself.  Returns an enumerator
 * when no block is given (RETURN_ENUMERATOR), otherwise returns beg.
 *
 * Three strategies are used:
 *   1. both strings are one byte long and ASCII: step the byte value;
 *   2. both strings are ASCII and consist of digits only: count
 *      numerically and format with the byte width of beg;
 *   3. otherwise: call "succ" repeatedly.
 */
03019 static VALUE
03020 rb_str_upto(int argc, VALUE *argv, VALUE beg)
03021 {
03022 VALUE end, exclusive;
03023 VALUE current, after_end;
03024 ID succ;
03025 int n, excl, ascii;
03026 rb_encoding *enc;
03027
03028 rb_scan_args(argc, argv, "11", &end, &exclusive);
03029 RETURN_ENUMERATOR(beg, argc, argv);
03030 excl = RTEST(exclusive);
03031 CONST_ID(succ, "succ");
03032 StringValue(end);
03033 enc = rb_enc_check(beg, end);
03034 ascii = (is_ascii_string(beg) && is_ascii_string(end));
03035
/* strategy 1: single ASCII byte on both ends */
03036 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
03037 char c = RSTRING_PTR(beg)[0];
03038 char e = RSTRING_PTR(end)[0];
03039
03040 if (c > e || (excl && c == e)) return beg;
03041 for (;;) {
03042 rb_yield(rb_enc_str_new(&c, 1, enc));
03043 if (!excl && c == e) break;
03044 c++;
03045 if (excl && c == e) break;
03046 }
03047 return beg;
03048 }
03049
/* strategy 2: candidates start with a digit; verified below */
03050 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0])) {
03051 char *s, *send;
03052 VALUE b, e;
03053 int width;
03054
03055 s = RSTRING_PTR(beg); send = RSTRING_END(beg);
/* the byte length of beg is the zero-padded width of every yielded value */
03056 width = rb_long2int(send - s);
/* any non-digit in either string falls back to strategy 3 */
03057 while (s < send) {
03058 if (!ISDIGIT(*s)) goto no_digits;
03059 s++;
03060 }
03061 s = RSTRING_PTR(end); send = RSTRING_END(end);
03062 while (s < send) {
03063 if (!ISDIGIT(*s)) goto no_digits;
03064 s++;
03065 }
03066 b = rb_str_to_inum(beg, 10, FALSE);
03067 e = rb_str_to_inum(end, 10, FALSE);
/* both bounds are Fixnums: iterate with plain C longs */
03068 if (FIXNUM_P(b) && FIXNUM_P(e)) {
03069 long bi = FIX2LONG(b);
03070 long ei = FIX2LONG(e);
03071 rb_encoding *usascii = rb_usascii_encoding();
03072
03073 while (bi <= ei) {
03074 if (excl && bi == ei) break;
03075 rb_yield(rb_enc_sprintf(usascii, "%.*ld", width, bi));
03076 bi++;
03077 }
03078 }
/* otherwise compare and increment through method calls on the numbers */
03079 else {
03080 ID op = excl ? '<' : rb_intern("<=");
03081 VALUE args[2], fmt = rb_obj_freeze(rb_usascii_str_new_cstr("%.*d"));
03082
03083 args[0] = INT2FIX(width);
03084 while (rb_funcall(b, op, 1, e)) {
03085 args[1] = b;
03086 rb_yield(rb_str_format(numberof(args), args, fmt));
03087 b = rb_funcall(b, succ, 0, 0);
03088 }
03089 }
03090 return beg;
03091 }
03092
/* strategy 3: generic iteration via "succ" */
03093 no_digits:
03094 n = rb_str_cmp(beg, end);
03095 if (n > 0 || (excl && n == 0)) return beg;
03096
03097 after_end = rb_funcall(end, succ, 0, 0);
03098 current = rb_str_dup(beg);
03099 while (!rb_str_equal(current, after_end)) {
03100 VALUE next = Qnil;
/* the successor is computed before yielding; it stays nil when current
 * equals end in the inclusive case, which ends the loop after the yield */
03101 if (excl || !rb_str_equal(current, end))
03102 next = rb_funcall(current, succ, 0, 0);
03103 rb_yield(current);
03104 if (NIL_P(next)) break;
03105 current = next;
03106 StringValue(current);
03107 if (excl && rb_str_equal(current, end)) break;
/* stop once the value has grown longer than end or has become empty */
03108 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
03109 break;
03110 }
03111
03112 return beg;
03113 }
03114
03115 static VALUE
03116 rb_str_subpat(VALUE str, VALUE re, VALUE backref)
03117 {
03118 if (rb_reg_search(re, str, 0, 0) >= 0) {
03119 VALUE match = rb_backref_get();
03120 int nth = rb_reg_backref_number(match, backref);
03121 return rb_reg_nth_match(nth, match);
03122 }
03123 return Qnil;
03124 }
03125
03126 static VALUE
03127 rb_str_aref(VALUE str, VALUE indx)
03128 {
03129 long idx;
03130
03131 switch (TYPE(indx)) {
03132 case T_FIXNUM:
03133 idx = FIX2LONG(indx);
03134
03135 num_index:
03136 str = rb_str_substr(str, idx, 1);
03137 if (!NIL_P(str) && RSTRING_LEN(str) == 0) return Qnil;
03138 return str;
03139
03140 case T_REGEXP:
03141 return rb_str_subpat(str, indx, INT2FIX(0));
03142
03143 case T_STRING:
03144 if (rb_str_index(str, indx, 0) != -1)
03145 return rb_str_dup(indx);
03146 return Qnil;
03147
03148 default:
03149
03150 {
03151 long beg, len;
03152 VALUE tmp;
03153
03154 len = str_strlen(str, STR_ENC_GET(str));
03155 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
03156 case Qfalse:
03157 break;
03158 case Qnil:
03159 return Qnil;
03160 default:
03161 tmp = rb_str_substr(str, beg, len);
03162 return tmp;
03163 }
03164 }
03165 idx = NUM2LONG(indx);
03166 goto num_index;
03167 }
03168 return Qnil;
03169 }
03170
03171
03172
03173
03174
03175
03176
03177
03178
03179
03180
03181
03182
03183
03184
03185
03186
03187
03188
03189
03190
03191
03192
03193
03194
03195
03196
03197
03198
03199
03200
03201
03202
03203
03204
03205
03206
03207
03208
03209
03210
03211
03212
03213
03214
03215
03216
03217
03218
03219
03220
03221 static VALUE
03222 rb_str_aref_m(int argc, VALUE *argv, VALUE str)
03223 {
03224 if (argc == 2) {
03225 if (TYPE(argv[0]) == T_REGEXP) {
03226 return rb_str_subpat(str, argv[0], argv[1]);
03227 }
03228 return rb_str_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
03229 }
03230 if (argc != 1) {
03231 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03232 }
03233 return rb_str_aref(str, argv[0]);
03234 }
03235
03236 VALUE
03237 rb_str_drop_bytes(VALUE str, long len)
03238 {
03239 char *ptr = RSTRING_PTR(str);
03240 long olen = RSTRING_LEN(str), nlen;
03241
03242 str_modifiable(str);
03243 if (len > olen) len = olen;
03244 nlen = olen - len;
03245 if (nlen <= RSTRING_EMBED_LEN_MAX) {
03246 char *oldptr = ptr;
03247 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|ELTS_SHARED));
03248 STR_SET_EMBED(str);
03249 STR_SET_EMBED_LEN(str, nlen);
03250 ptr = RSTRING(str)->as.ary;
03251 memmove(ptr, oldptr + len, nlen);
03252 if (fl == STR_NOEMBED) xfree(oldptr);
03253 }
03254 else {
03255 if (!STR_SHARED_P(str)) rb_str_new4(str);
03256 ptr = RSTRING(str)->as.heap.ptr += len;
03257 RSTRING(str)->as.heap.len = nlen;
03258 }
03259 ptr[nlen] = 0;
03260 ENC_CODERANGE_CLEAR(str);
03261 return str;
03262 }
03263
03264 static void
03265 rb_str_splice_0(VALUE str, long beg, long len, VALUE val)
03266 {
03267 if (beg == 0 && RSTRING_LEN(val) == 0) {
03268 rb_str_drop_bytes(str, len);
03269 OBJ_INFECT(str, val);
03270 return;
03271 }
03272
03273 rb_str_modify(str);
03274 if (len < RSTRING_LEN(val)) {
03275
03276 RESIZE_CAPA(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len + 1);
03277 }
03278
03279 if (RSTRING_LEN(val) != len) {
03280 memmove(RSTRING_PTR(str) + beg + RSTRING_LEN(val),
03281 RSTRING_PTR(str) + beg + len,
03282 RSTRING_LEN(str) - (beg + len));
03283 }
03284 if (RSTRING_LEN(val) < beg && len < 0) {
03285 MEMZERO(RSTRING_PTR(str) + RSTRING_LEN(str), char, -len);
03286 }
03287 if (RSTRING_LEN(val) > 0) {
03288 memmove(RSTRING_PTR(str)+beg, RSTRING_PTR(val), RSTRING_LEN(val));
03289 }
03290 STR_SET_LEN(str, RSTRING_LEN(str) + RSTRING_LEN(val) - len);
03291 if (RSTRING_PTR(str)) {
03292 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
03293 }
03294 OBJ_INFECT(str, val);
03295 }
03296
03297 static void
03298 rb_str_splice(VALUE str, long beg, long len, VALUE val)
03299 {
03300 long slen;
03301 char *p, *e;
03302 rb_encoding *enc;
03303 int singlebyte = single_byte_optimizable(str);
03304 int cr;
03305
03306 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
03307
03308 StringValue(val);
03309 enc = rb_enc_check(str, val);
03310 slen = str_strlen(str, enc);
03311
03312 if (slen < beg) {
03313 out_of_range:
03314 rb_raise(rb_eIndexError, "index %ld out of string", beg);
03315 }
03316 if (beg < 0) {
03317 if (-beg > slen) {
03318 goto out_of_range;
03319 }
03320 beg += slen;
03321 }
03322 if (slen < len || slen < beg + len) {
03323 len = slen - beg;
03324 }
03325 str_modify_keep_cr(str);
03326 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
03327 if (!p) p = RSTRING_END(str);
03328 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
03329 if (!e) e = RSTRING_END(str);
03330
03331 beg = p - RSTRING_PTR(str);
03332 len = e - p;
03333 rb_str_splice_0(str, beg, len, val);
03334 rb_enc_associate(str, enc);
03335 cr = ENC_CODERANGE_AND(ENC_CODERANGE(str), ENC_CODERANGE(val));
03336 if (cr != ENC_CODERANGE_BROKEN)
03337 ENC_CODERANGE_SET(str, cr);
03338 }
03339
03340 void
03341 rb_str_update(VALUE str, long beg, long len, VALUE val)
03342 {
03343 rb_str_splice(str, beg, len, val);
03344 }
03345
03346 static void
03347 rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
03348 {
03349 int nth;
03350 VALUE match;
03351 long start, end, len;
03352 rb_encoding *enc;
03353 struct re_registers *regs;
03354
03355 if (rb_reg_search(re, str, 0, 0) < 0) {
03356 rb_raise(rb_eIndexError, "regexp not matched");
03357 }
03358 match = rb_backref_get();
03359 nth = rb_reg_backref_number(match, backref);
03360 regs = RMATCH_REGS(match);
03361 if (nth >= regs->num_regs) {
03362 out_of_range:
03363 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
03364 }
03365 if (nth < 0) {
03366 if (-nth >= regs->num_regs) {
03367 goto out_of_range;
03368 }
03369 nth += regs->num_regs;
03370 }
03371
03372 start = BEG(nth);
03373 if (start == -1) {
03374 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
03375 }
03376 end = END(nth);
03377 len = end - start;
03378 StringValue(val);
03379 enc = rb_enc_check(str, val);
03380 rb_str_splice_0(str, start, len, val);
03381 rb_enc_associate(str, enc);
03382 }
03383
03384 static VALUE
03385 rb_str_aset(VALUE str, VALUE indx, VALUE val)
03386 {
03387 long idx, beg;
03388
03389 switch (TYPE(indx)) {
03390 case T_FIXNUM:
03391 idx = FIX2LONG(indx);
03392 num_index:
03393 rb_str_splice(str, idx, 1, val);
03394 return val;
03395
03396 case T_REGEXP:
03397 rb_str_subpat_set(str, indx, INT2FIX(0), val);
03398 return val;
03399
03400 case T_STRING:
03401 beg = rb_str_index(str, indx, 0);
03402 if (beg < 0) {
03403 rb_raise(rb_eIndexError, "string not matched");
03404 }
03405 beg = rb_str_sublen(str, beg);
03406 rb_str_splice(str, beg, str_strlen(indx, 0), val);
03407 return val;
03408
03409 default:
03410
03411 {
03412 long beg, len;
03413 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, 0), 2)) {
03414 rb_str_splice(str, beg, len, val);
03415 return val;
03416 }
03417 }
03418 idx = NUM2LONG(indx);
03419 goto num_index;
03420 }
03421 }
03422
03423
03424
03425
03426
03427
03428
03429
03430
03431
03432
03433
03434
03435
03436
03437
03438
03439
03440
03441
03442
03443
03444
03445
03446
03447
03448 static VALUE
03449 rb_str_aset_m(int argc, VALUE *argv, VALUE str)
03450 {
03451 if (argc == 3) {
03452 if (TYPE(argv[0]) == T_REGEXP) {
03453 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
03454 }
03455 else {
03456 rb_str_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
03457 }
03458 return argv[2];
03459 }
03460 if (argc != 2) {
03461 rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..3)", argc);
03462 }
03463 return rb_str_aset(str, argv[0], argv[1]);
03464 }
03465
03466
03467
03468
03469
03470
03471
03472
03473
03474
03475
03476
03477
03478
03479
03480
03481
03482
03483 static VALUE
03484 rb_str_insert(VALUE str, VALUE idx, VALUE str2)
03485 {
03486 long pos = NUM2LONG(idx);
03487
03488 if (pos == -1) {
03489 return rb_str_append(str, str2);
03490 }
03491 else if (pos < 0) {
03492 pos++;
03493 }
03494 rb_str_splice(str, pos, 0, str2);
03495 return str;
03496 }
03497
03498
03499
03500
03501
03502
03503
03504
03505
03506
03507
03508
03509
03510
03511
03512
03513
03514
03515
03516
03517
03518 static VALUE
03519 rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
03520 {
03521 VALUE result;
03522 VALUE buf[3];
03523 int i;
03524
03525 if (argc < 1 || 2 < argc) {
03526 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03527 }
03528 for (i=0; i<argc; i++) {
03529 buf[i] = argv[i];
03530 }
03531 str_modify_keep_cr(str);
03532 result = rb_str_aref_m(argc, buf, str);
03533 if (!NIL_P(result)) {
03534 buf[i] = rb_str_new(0,0);
03535 rb_str_aset_m(argc+1, buf, str);
03536 }
03537 return result;
03538 }
03539
03540 static VALUE
03541 get_pat(VALUE pat, int quote)
03542 {
03543 VALUE val;
03544
03545 switch (TYPE(pat)) {
03546 case T_REGEXP:
03547 return pat;
03548
03549 case T_STRING:
03550 break;
03551
03552 default:
03553 val = rb_check_string_type(pat);
03554 if (NIL_P(val)) {
03555 Check_Type(pat, T_REGEXP);
03556 }
03557 pat = val;
03558 }
03559
03560 if (quote) {
03561 pat = rb_reg_quote(pat);
03562 }
03563
03564 return rb_reg_regcomp(pat);
03565 }
03566
03567
03568
03569
03570
03571
03572
03573
03574
03575
03576
03577
/*
 * In-place substitution of the first match of a pattern.
 *
 * Accepted call shapes:
 *   (pattern) with a block   - the block's value is the replacement;
 *   (pattern, replacement)   - replacement is a Hash (looked up with the
 *                              matched text) or is converted to a String.
 * Anything else raises ArgumentError.
 *
 * Returns str when a substitution was made, nil when the pattern did not
 * match.  May raise EncCompatError when the replacement's encoding cannot
 * be combined with str.
 */
03578 static VALUE
03579 rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
03580 {
03581 VALUE pat, repl, hash = Qnil;
03582 int iter = 0;
03583 int tainted = 0;
03584 int untrusted = 0;
03585 long plen;
03586
03587 if (argc == 1 && rb_block_given_p()) {
03588 iter = 1;
03589 }
03590 else if (argc == 2) {
03591 repl = argv[1];
03592 hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
03593 if (NIL_P(hash)) {
03594 StringValue(repl);
03595 }
03596 if (OBJ_TAINTED(repl)) tainted = 1;
03597 if (OBJ_UNTRUSTED(repl)) untrusted = 1;
03598 }
03599 else {
03600 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03601 }
03602
/* get_pat is called with quote = 1 */
03603 pat = get_pat(argv[0], 1);
03604 str_modifiable(str);
03605 if (rb_reg_search(pat, str, 0, 0) >= 0) {
03606 rb_encoding *enc;
03607 int cr = ENC_CODERANGE(str);
03608 VALUE match = rb_backref_get();
03609 struct re_registers *regs = RMATCH_REGS(match);
/* beg0/end0: offsets of the whole match (group 0) */
03610 long beg0 = BEG(0);
03611 long end0 = END(0);
03612 char *p, *rp;
03613 long len, rlen;
03614
03615 if (iter || !NIL_P(hash)) {
/* remember pointer and length so str_mod_check() can compare them after
 * the block / hash lookup has run */
03616 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03617
03618 if (iter) {
03619 repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03620 }
03621 else {
03622 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
03623 repl = rb_obj_as_string(repl);
03624 }
03625 str_mod_check(str, p, len);
03626 rb_check_frozen(str);
03627 }
03628 else {
/* plain replacement string: expanded by rb_reg_regsub() */
03629 repl = rb_reg_regsub(repl, str, regs, pat);
03630 }
03631 enc = rb_enc_compatible(str, repl);
03632 if (!enc) {
/* No common encoding.  This is tolerated only when the text before and
 * after the match both scan as 7-bit; the result then takes the
 * replacement's encoding. */
03633 rb_encoding *str_enc = STR_ENC_GET(str);
03634 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
03635 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
03636 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
03637 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
03638 rb_enc_name(str_enc),
03639 rb_enc_name(STR_ENC_GET(repl)));
03640 }
03641 enc = STR_ENC_GET(repl);
03642 }
03643 rb_str_modify(str);
03644 rb_enc_associate(str, enc);
03645 if (OBJ_TAINTED(repl)) tainted = 1;
03646 if (OBJ_UNTRUSTED(repl)) untrusted = 1;
/* Combine code ranges when the original one was known and not broken:
 * a broken replacement, or a 7-bit replacement going into a VALID string,
 * makes the result unknown; otherwise the replacement's range is used. */
03647 if (ENC_CODERANGE_UNKNOWN < cr && cr < ENC_CODERANGE_BROKEN) {
03648 int cr2 = ENC_CODERANGE(repl);
03649 if (cr2 == ENC_CODERANGE_BROKEN ||
03650 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
03651 cr = ENC_CODERANGE_UNKNOWN;
03652 else
03653 cr = cr2;
03654 }
/* plen: byte length of the matched span, rlen: of the replacement */
03655 plen = end0 - beg0;
03656 rp = RSTRING_PTR(repl); rlen = RSTRING_LEN(repl);
03657 len = RSTRING_LEN(str);
03658 if (rlen > plen) {
03659 RESIZE_CAPA(str, len + rlen - plen);
03660 }
03661 p = RSTRING_PTR(str);
/* shift the text after the match, then copy the replacement in */
03662 if (rlen != plen) {
03663 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
03664 }
03665 memcpy(p + beg0, rp, rlen);
03666 len += rlen - plen;
03667 STR_SET_LEN(str, len);
03668 RSTRING_PTR(str)[len] = '\0';
03669 ENC_CODERANGE_SET(str, cr);
03670 if (tainted) OBJ_TAINT(str);
03671 if (untrusted) OBJ_UNTRUST(str);
03672
03673 return str;
03674 }
03675 return Qnil;
03676 }
03677
03678
03679
03680
03681
03682
03683
03684
03685
03686
03687
03688
03689
03690
03691
03692
03693
03694
03695
03696
03697
03698
03699
03700
03701
03702
03703
03704
03705
03706
03707
03708
03709
03710
03711
03712
03713
03714
03715
03716
03717
03718
03719 static VALUE
03720 rb_str_sub(int argc, VALUE *argv, VALUE str)
03721 {
03722 str = rb_str_dup(str);
03723 rb_str_sub_bang(argc, argv, str);
03724 return str;
03725 }
03726
/*
 * Shared implementation of global substitution.
 *
 * argc/argv - (pattern) with a block, or (pattern, replacement) where the
 *             replacement is a Hash or is converted to a String.  With one
 *             argument and no block an enumerator is returned
 *             (RETURN_ENUMERATOR).  Other counts raise ArgumentError.
 * bang      - non-zero: replace the contents of str and return str, or
 *             return nil when nothing matched;
 *             zero: return a new string (a duplicate of str when nothing
 *             matched).
 *
 * The result is assembled in a separate buffer (dest) by alternating the
 * unmatched text and the replacements.
 */
03727 static VALUE
03728 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
03729 {
03730 VALUE pat, val, repl, match, dest, hash = Qnil;
03731 struct re_registers *regs;
03732 long beg, n;
03733 long beg0, end0;
03734 long offset, blen, slen, len, last;
03735 int iter = 0;
03736 char *sp, *cp;
03737 int tainted = 0;
03738 rb_encoding *str_enc;
03739
03740 switch (argc) {
03741 case 1:
03742 RETURN_ENUMERATOR(str, argc, argv);
03743 iter = 1;
03744 break;
03745 case 2:
03746 repl = argv[1];
03747 hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
03748 if (NIL_P(hash)) {
03749 StringValue(repl);
03750 }
03751 if (OBJ_TAINTED(repl)) tainted = 1;
03752 break;
03753 default:
03754 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
03755 }
03756
/* get_pat is called with quote = 1 */
03757 pat = get_pat(argv[0], 1);
03758 beg = rb_reg_search(pat, str, 0, 0);
03759 if (beg < 0) {
03760 if (bang) return Qnil;
03761 return rb_str_dup(str);
03762 }
03763
/* offset: position in str up to which text has been consumed */
03764 offset = 0;
03765 n = 0;
/* initial buffer capacity: source length plus 30 bytes of slack */
03766 blen = RSTRING_LEN(str) + 30;
03767 dest = rb_str_buf_new(blen);
/* sp/slen are kept for str_mod_check(); cp points at the next source
 * text still to be copied */
03768 sp = RSTRING_PTR(str);
03769 slen = RSTRING_LEN(str);
03770 cp = sp;
03771 str_enc = STR_ENC_GET(str);
03772 rb_enc_associate(dest, str_enc);
03773 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
03774
03775 do {
03776 n++;
03777 match = rb_backref_get();
03778 regs = RMATCH_REGS(match);
03779 beg0 = BEG(0);
03780 end0 = END(0);
03781 if (iter || !NIL_P(hash)) {
03782 if (iter) {
03783 val = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
03784 }
03785 else {
03786 val = rb_hash_aref(hash, rb_str_subseq(str, BEG(0), END(0) - BEG(0)));
03787 val = rb_obj_as_string(val);
03788 }
03789 str_mod_check(str, sp, slen);
/* the block (or hash) handed back the work buffer itself */
03790 if (val == dest) {
03791 rb_raise(rb_eRuntimeError, "block should not cheat");
03792 }
03793 }
03794 else {
03795 val = rb_reg_regsub(repl, str, regs, pat);
03796 }
03797
03798 if (OBJ_TAINTED(val)) tainted = 1;
03799
/* copy the unmatched text between the previous match and this one */
03800 len = beg - offset;
03801 if (len) {
03802 rb_enc_str_buf_cat(dest, cp, len, str_enc);
03803 }
03804
03805 rb_str_buf_append(dest, val);
03806
/* last: where the search that produced this match started */
03807 last = offset;
03808 offset = end0;
/* Empty match: copy one character of the source verbatim and move past
 * it so that the next search starts further on; stop at the end of str. */
03809 if (beg0 == end0) {
03810
03811
03812
03813
03814 if (RSTRING_LEN(str) <= end0) break;
03815 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
03816 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
03817 offset = end0 + len;
03818 }
03819 cp = RSTRING_PTR(str) + offset;
03820 if (offset > RSTRING_LEN(str)) break;
03821 beg = rb_reg_search(pat, str, offset, 0);
03822 } while (beg >= 0);
/* append whatever follows the final match */
03823 if (RSTRING_LEN(str) > offset) {
03824 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
03825 }
/* search once more from the start position of the last successful
 * search; NOTE(review): presumably this restores the last-match data
 * after the failed final search - confirm against rb_reg_search */
03826 rb_reg_search(pat, str, last, 0);
03827 if (bang) {
03828 rb_str_shared_replace(str, dest);
03829 }
03830 else {
/* give the result the receiver's class and taint */
03831 RBASIC(dest)->klass = rb_obj_class(str);
03832 OBJ_INFECT(dest, str);
03833 str = dest;
03834 }
03835
03836 if (tainted) OBJ_TAINT(str);
03837 return str;
03838 }
03839
03840
03841
03842
03843
03844
03845
03846
03847
03848
03849
03850
03851
03852 static VALUE
03853 rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
03854 {
03855 str_modify_keep_cr(str);
03856 return str_gsub(argc, argv, str, 1);
03857 }
03858
03859
03860
03861
03862
03863
03864
03865
03866
03867
03868
03869
03870
03871
03872
03873
03874
03875
03876
03877
03878
03879
03880
03881
03882
03883
03884
03885
03886
03887
03888
03889
03890
03891
03892
03893
03894
03895
03896
03897
03898
03899
03900
03901
03902
03903 static VALUE
03904 rb_str_gsub(int argc, VALUE *argv, VALUE str)
03905 {
03906 return str_gsub(argc, argv, str, 0);
03907 }
03908
03909
03910
03911
03912
03913
03914
03915
03916
03917
03918
03919
03920
03921 VALUE
03922 rb_str_replace(VALUE str, VALUE str2)
03923 {
03924 str_modifiable(str);
03925 if (str == str2) return str;
03926
03927 StringValue(str2);
03928 str_discard(str);
03929 return str_replace(str, str2);
03930 }
03931
03932
03933
03934
03935
03936
03937
03938
03939
03940
03941
03942 static VALUE
03943 rb_str_clear(VALUE str)
03944 {
03945 str_discard(str);
03946 STR_SET_EMBED(str);
03947 STR_SET_EMBED_LEN(str, 0);
03948 RSTRING_PTR(str)[0] = 0;
03949 if (rb_enc_asciicompat(STR_ENC_GET(str)))
03950 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
03951 else
03952 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
03953 return str;
03954 }
03955
03956
03957
03958
03959
03960
03961
03962
03963
03964
03965
03966 static VALUE
03967 rb_str_chr(VALUE str)
03968 {
03969 return rb_str_substr(str, 0, 1);
03970 }
03971
03972
03973
03974
03975
03976
03977
03978 static VALUE
03979 rb_str_getbyte(VALUE str, VALUE index)
03980 {
03981 long pos = NUM2LONG(index);
03982
03983 if (pos < 0)
03984 pos += RSTRING_LEN(str);
03985 if (pos < 0 || RSTRING_LEN(str) <= pos)
03986 return Qnil;
03987
03988 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
03989 }
03990
03991
03992
03993
03994
03995
03996
03997 static VALUE
03998 rb_str_setbyte(VALUE str, VALUE index, VALUE value)
03999 {
04000 long pos = NUM2LONG(index);
04001 int byte = NUM2INT(value);
04002
04003 rb_str_modify(str);
04004
04005 if (pos < -RSTRING_LEN(str) || RSTRING_LEN(str) <= pos)
04006 rb_raise(rb_eIndexError, "index %ld out of string", pos);
04007 if (pos < 0)
04008 pos += RSTRING_LEN(str);
04009
04010 RSTRING_PTR(str)[pos] = byte;
04011
04012 return value;
04013 }
04014
04015 static VALUE
04016 str_byte_substr(VALUE str, long beg, long len)
04017 {
04018 char *p, *s = RSTRING_PTR(str);
04019 long n = RSTRING_LEN(str);
04020 VALUE str2;
04021
04022 if (beg > n || len < 0) return Qnil;
04023 if (beg < 0) {
04024 beg += n;
04025 if (beg < 0) return Qnil;
04026 }
04027 if (beg + len > n)
04028 len = n - beg;
04029 if (len <= 0) {
04030 len = 0;
04031 p = 0;
04032 }
04033 else
04034 p = s + beg;
04035
04036 if (len > RSTRING_EMBED_LEN_MAX && beg + len == n) {
04037 str2 = rb_str_new4(str);
04038 str2 = str_new3(rb_obj_class(str2), str2);
04039 RSTRING(str2)->as.heap.ptr += RSTRING(str2)->as.heap.len - len;
04040 RSTRING(str2)->as.heap.len = len;
04041 }
04042 else {
04043 str2 = rb_str_new5(str, p, len);
04044 rb_enc_cr_str_copy_for_substr(str2, str);
04045 OBJ_INFECT(str2, str);
04046 }
04047
04048 return str2;
04049 }
04050
04051 static VALUE
04052 str_byte_aref(VALUE str, VALUE indx)
04053 {
04054 long idx;
04055 switch (TYPE(indx)) {
04056 case T_FIXNUM:
04057 idx = FIX2LONG(indx);
04058
04059 num_index:
04060 str = str_byte_substr(str, idx, 1);
04061 if (NIL_P(str) || RSTRING_LEN(str) == 0) return Qnil;
04062 return str;
04063
04064 default:
04065
04066 {
04067 long beg, len = RSTRING_LEN(str);
04068
04069 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
04070 case Qfalse:
04071 break;
04072 case Qnil:
04073 return Qnil;
04074 default:
04075 return str_byte_substr(str, beg, len);
04076 }
04077 }
04078 idx = NUM2LONG(indx);
04079 goto num_index;
04080 }
04081 return Qnil;
04082 }
04083
04084
04085
04086
04087
04088
04089
04090
04091
04092
04093
04094
04095
04096
04097
04098
04099
04100
04101
04102
04103
04104
04105
04106
04107 static VALUE
04108 rb_str_byteslice(int argc, VALUE *argv, VALUE str)
04109 {
04110 if (argc == 2) {
04111 return str_byte_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
04112 }
04113 if (argc != 1) {
04114 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
04115 }
04116 return str_byte_aref(str, argv[0]);
04117 }
04118
04119
04120
04121
04122
04123
04124
04125
04126
04127
04128 static VALUE
04129 rb_str_reverse(VALUE str)
04130 {
04131 rb_encoding *enc;
04132 VALUE rev;
04133 char *s, *e, *p;
04134 int single = 1;
04135
04136 if (RSTRING_LEN(str) <= 1) return rb_str_dup(str);
04137 enc = STR_ENC_GET(str);
04138 rev = rb_str_new5(str, 0, RSTRING_LEN(str));
04139 s = RSTRING_PTR(str); e = RSTRING_END(str);
04140 p = RSTRING_END(rev);
04141
04142 if (RSTRING_LEN(str) > 1) {
04143 if (single_byte_optimizable(str)) {
04144 while (s < e) {
04145 *--p = *s++;
04146 }
04147 }
04148 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID) {
04149 while (s < e) {
04150 int clen = rb_enc_fast_mbclen(s, e, enc);
04151
04152 if (clen > 1 || (*s & 0x80)) single = 0;
04153 p -= clen;
04154 memcpy(p, s, clen);
04155 s += clen;
04156 }
04157 }
04158 else {
04159 while (s < e) {
04160 int clen = rb_enc_mbclen(s, e, enc);
04161
04162 if (clen > 1 || (*s & 0x80)) single = 0;
04163 p -= clen;
04164 memcpy(p, s, clen);
04165 s += clen;
04166 }
04167 }
04168 }
04169 STR_SET_LEN(rev, RSTRING_LEN(str));
04170 OBJ_INFECT(rev, str);
04171 if (ENC_CODERANGE(str) == ENC_CODERANGE_UNKNOWN) {
04172 if (single) {
04173 ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
04174 }
04175 else {
04176 ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
04177 }
04178 }
04179 rb_enc_cr_str_copy_for_substr(rev, str);
04180
04181 return rev;
04182 }
04183
04184
04185
04186
04187
04188
04189
04190
04191
04192 static VALUE
04193 rb_str_reverse_bang(VALUE str)
04194 {
04195 if (RSTRING_LEN(str) > 1) {
04196 if (single_byte_optimizable(str)) {
04197 char *s, *e, c;
04198
04199 str_modify_keep_cr(str);
04200 s = RSTRING_PTR(str);
04201 e = RSTRING_END(str) - 1;
04202 while (s < e) {
04203 c = *s;
04204 *s++ = *e;
04205 *e-- = c;
04206 }
04207 }
04208 else {
04209 rb_str_shared_replace(str, rb_str_reverse(str));
04210 }
04211 }
04212 else {
04213 str_modify_keep_cr(str);
04214 }
04215 return str;
04216 }
04217
04218
04219
04220
04221
04222
04223
04224
04225
04226
04227
04228
04229
04230
04231 static VALUE
04232 rb_str_include(VALUE str, VALUE arg)
04233 {
04234 long i;
04235
04236 StringValue(arg);
04237 i = rb_str_index(str, arg, 0);
04238
04239 if (i == -1) return Qfalse;
04240 return Qtrue;
04241 }
04242
04243
04244
04245
04246
04247
04248
04249
04250
04251
04252
04253
04254
04255
04256
04257
04258
04259
04260
04261
04262
04263
04264
04265 static VALUE
04266 rb_str_to_i(int argc, VALUE *argv, VALUE str)
04267 {
04268 int base;
04269
04270 if (argc == 0) base = 10;
04271 else {
04272 VALUE b;
04273
04274 rb_scan_args(argc, argv, "01", &b);
04275 base = NUM2INT(b);
04276 }
04277 if (base < 0) {
04278 rb_raise(rb_eArgError, "invalid radix %d", base);
04279 }
04280 return rb_str_to_inum(str, base, FALSE);
04281 }
04282
04283
04284
04285
04286
04287
04288
04289
04290
04291
04292
04293
04294
04295
04296
04297
04298 static VALUE
04299 rb_str_to_f(VALUE str)
04300 {
04301 return DBL2NUM(rb_str_to_dbl(str, FALSE));
04302 }
04303
04304
04305
04306
04307
04308
04309
04310
04311
04312
04313 static VALUE
04314 rb_str_to_s(VALUE str)
04315 {
04316 if (rb_obj_class(str) != rb_cString) {
04317 return str_duplicate(rb_cString, str);
04318 }
04319 return str;
04320 }
04321
#if 0
/*
 * Currently compiled out.  Encodes codepoint +c+ in +enc+ into a scratch
 * buffer and appends the resulting bytes to +str+.
 */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    const int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
04333
04334 #define CHAR_ESC_LEN 13
04335
04336 int
04337 rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
04338 {
04339 char buf[CHAR_ESC_LEN + 1];
04340 int l;
04341
04342 #if SIZEOF_INT > 4
04343 c &= 0xffffffff;
04344 #endif
04345 if (unicode_p) {
04346 if (c < 0x7F && ISPRINT(c)) {
04347 snprintf(buf, CHAR_ESC_LEN, "%c", c);
04348 }
04349 else if (c < 0x10000) {
04350 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
04351 }
04352 else {
04353 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
04354 }
04355 }
04356 else {
04357 if (c < 0x100) {
04358 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
04359 }
04360 else {
04361 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
04362 }
04363 }
04364 l = (int)strlen(buf);
04365 rb_str_buf_cat(result, buf, l);
04366 return l;
04367 }
04368
04369
04370
04371
04372
04373
04374
04375
04376
04377
04378
04379
04380
04381 VALUE
04382 rb_str_inspect(VALUE str)
04383 {
04384 rb_encoding *enc = STR_ENC_GET(str);
04385 const char *p, *pend, *prev;
04386 char buf[CHAR_ESC_LEN + 1];
04387 VALUE result = rb_str_buf_new(0);
04388 rb_encoding *resenc = rb_default_internal_encoding();
04389 int unicode_p = rb_enc_unicode_p(enc);
04390 int asciicompat = rb_enc_asciicompat(enc);
04391 static rb_encoding *utf16, *utf32;
04392
04393 if (!utf16) utf16 = rb_enc_find("UTF-16");
04394 if (!utf32) utf32 = rb_enc_find("UTF-32");
04395 if (resenc == NULL) resenc = rb_default_external_encoding();
04396 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
04397 rb_enc_associate(result, resenc);
04398 str_buf_cat2(result, "\"");
04399
04400 p = RSTRING_PTR(str); pend = RSTRING_END(str);
04401 prev = p;
04402 if (enc == utf16) {
04403 const unsigned char *q = (const unsigned char *)p;
04404 if (q[0] == 0xFE && q[1] == 0xFF)
04405 enc = rb_enc_find("UTF-16BE");
04406 else if (q[0] == 0xFF && q[1] == 0xFE)
04407 enc = rb_enc_find("UTF-16LE");
04408 else
04409 unicode_p = 0;
04410 }
04411 else if (enc == utf32) {
04412 const unsigned char *q = (const unsigned char *)p;
04413 if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF)
04414 enc = rb_enc_find("UTF-32BE");
04415 else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF)
04416 enc = rb_enc_find("UTF-32LE");
04417 else
04418 unicode_p = 0;
04419 }
04420 while (p < pend) {
04421 unsigned int c, cc;
04422 int n;
04423
04424 n = rb_enc_precise_mbclen(p, pend, enc);
04425 if (!MBCLEN_CHARFOUND_P(n)) {
04426 if (p > prev) str_buf_cat(result, prev, p - prev);
04427 n = rb_enc_mbminlen(enc);
04428 if (pend < p + n)
04429 n = (int)(pend - p);
04430 while (n--) {
04431 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
04432 str_buf_cat(result, buf, strlen(buf));
04433 prev = ++p;
04434 }
04435 continue;
04436 }
04437 n = MBCLEN_CHARFOUND_LEN(n);
04438 c = rb_enc_mbc_to_codepoint(p, pend, enc);
04439 p += n;
04440 if ((asciicompat || unicode_p) &&
04441 (c == '"'|| c == '\\' ||
04442 (c == '#' &&
04443 p < pend &&
04444 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
04445 (cc = rb_enc_codepoint(p,pend,enc),
04446 (cc == '$' || cc == '@' || cc == '{'))))) {
04447 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04448 str_buf_cat2(result, "\\");
04449 if (asciicompat || enc == resenc) {
04450 prev = p - n;
04451 continue;
04452 }
04453 }
04454 switch (c) {
04455 case '\n': cc = 'n'; break;
04456 case '\r': cc = 'r'; break;
04457 case '\t': cc = 't'; break;
04458 case '\f': cc = 'f'; break;
04459 case '\013': cc = 'v'; break;
04460 case '\010': cc = 'b'; break;
04461 case '\007': cc = 'a'; break;
04462 case 033: cc = 'e'; break;
04463 default: cc = 0; break;
04464 }
04465 if (cc) {
04466 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04467 buf[0] = '\\';
04468 buf[1] = (char)cc;
04469 str_buf_cat(result, buf, 2);
04470 prev = p;
04471 continue;
04472 }
04473 if ((enc == resenc && rb_enc_isprint(c, enc)) ||
04474 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
04475 continue;
04476 }
04477 else {
04478 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
04479 rb_str_buf_cat_escaped_char(result, c, unicode_p);
04480 prev = p;
04481 continue;
04482 }
04483 }
04484 if (p > prev) str_buf_cat(result, prev, p - prev);
04485 str_buf_cat2(result, "\"");
04486
04487 OBJ_INFECT(result, str);
04488 return result;
04489 }
04490
04491 #define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
04492
04493
04494
04495
04496
04497
04498
04499
04500
04501 VALUE
04502 rb_str_dump(VALUE str)
04503 {
04504 rb_encoding *enc = rb_enc_get(str);
04505 long len;
04506 const char *p, *pend;
04507 char *q, *qend;
04508 VALUE result;
04509 int u8 = (enc == rb_utf8_encoding());
04510
04511 len = 2;
04512 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04513 while (p < pend) {
04514 unsigned char c = *p++;
04515 switch (c) {
04516 case '"': case '\\':
04517 case '\n': case '\r':
04518 case '\t': case '\f':
04519 case '\013': case '\010': case '\007': case '\033':
04520 len += 2;
04521 break;
04522
04523 case '#':
04524 len += IS_EVSTR(p, pend) ? 2 : 1;
04525 break;
04526
04527 default:
04528 if (ISPRINT(c)) {
04529 len++;
04530 }
04531 else {
04532 if (u8) {
04533 int n = rb_enc_precise_mbclen(p-1, pend, enc);
04534 if (MBCLEN_CHARFOUND_P(n-1)) {
04535 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04536 while (cc >>= 4) len++;
04537 len += 5;
04538 p += MBCLEN_CHARFOUND_LEN(n)-1;
04539 break;
04540 }
04541 }
04542 len += 4;
04543 }
04544 break;
04545 }
04546 }
04547 if (!rb_enc_asciicompat(enc)) {
04548 len += 19;
04549 len += strlen(enc->name);
04550 }
04551
04552 result = rb_str_new5(str, 0, len);
04553 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
04554 q = RSTRING_PTR(result); qend = q + len + 1;
04555
04556 *q++ = '"';
04557 while (p < pend) {
04558 unsigned char c = *p++;
04559
04560 if (c == '"' || c == '\\') {
04561 *q++ = '\\';
04562 *q++ = c;
04563 }
04564 else if (c == '#') {
04565 if (IS_EVSTR(p, pend)) *q++ = '\\';
04566 *q++ = '#';
04567 }
04568 else if (c == '\n') {
04569 *q++ = '\\';
04570 *q++ = 'n';
04571 }
04572 else if (c == '\r') {
04573 *q++ = '\\';
04574 *q++ = 'r';
04575 }
04576 else if (c == '\t') {
04577 *q++ = '\\';
04578 *q++ = 't';
04579 }
04580 else if (c == '\f') {
04581 *q++ = '\\';
04582 *q++ = 'f';
04583 }
04584 else if (c == '\013') {
04585 *q++ = '\\';
04586 *q++ = 'v';
04587 }
04588 else if (c == '\010') {
04589 *q++ = '\\';
04590 *q++ = 'b';
04591 }
04592 else if (c == '\007') {
04593 *q++ = '\\';
04594 *q++ = 'a';
04595 }
04596 else if (c == '\033') {
04597 *q++ = '\\';
04598 *q++ = 'e';
04599 }
04600 else if (ISPRINT(c)) {
04601 *q++ = c;
04602 }
04603 else {
04604 *q++ = '\\';
04605 if (u8) {
04606 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
04607 if (MBCLEN_CHARFOUND_P(n)) {
04608 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
04609 p += n;
04610 snprintf(q, qend-q, "u{%x}", cc);
04611 q += strlen(q);
04612 continue;
04613 }
04614 }
04615 snprintf(q, qend-q, "x%02X", c);
04616 q += 3;
04617 }
04618 }
04619 *q++ = '"';
04620 *q = '\0';
04621 if (!rb_enc_asciicompat(enc)) {
04622 snprintf(q, qend-q, ".force_encoding(\"%s\")", enc->name);
04623 enc = rb_ascii8bit_encoding();
04624 }
04625 OBJ_INFECT(result, str);
04626
04627 rb_enc_associate(result, enc);
04628 ENC_CODERANGE_SET(result, ENC_CODERANGE_7BIT);
04629 return result;
04630 }
04631
04632
04633 static void
04634 rb_str_check_dummy_enc(rb_encoding *enc)
04635 {
04636 if (rb_enc_dummy_p(enc)) {
04637 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
04638 rb_enc_name(enc));
04639 }
04640 }
04641
04642
04643
04644
04645
04646
04647
04648
04649
04650
04651 static VALUE
04652 rb_str_upcase_bang(VALUE str)
04653 {
04654 rb_encoding *enc;
04655 char *s, *send;
04656 int modify = 0;
04657 int n;
04658
04659 str_modify_keep_cr(str);
04660 enc = STR_ENC_GET(str);
04661 rb_str_check_dummy_enc(enc);
04662 s = RSTRING_PTR(str); send = RSTRING_END(str);
04663 if (single_byte_optimizable(str)) {
04664 while (s < send) {
04665 unsigned int c = *(unsigned char*)s;
04666
04667 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04668 *s = 'A' + (c - 'a');
04669 modify = 1;
04670 }
04671 s++;
04672 }
04673 }
04674 else {
04675 int ascompat = rb_enc_asciicompat(enc);
04676
04677 while (s < send) {
04678 unsigned int c;
04679
04680 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04681 if (rb_enc_isascii(c, enc) && 'a' <= c && c <= 'z') {
04682 *s = 'A' + (c - 'a');
04683 modify = 1;
04684 }
04685 s++;
04686 }
04687 else {
04688 c = rb_enc_codepoint_len(s, send, &n, enc);
04689 if (rb_enc_islower(c, enc)) {
04690
04691 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04692 modify = 1;
04693 }
04694 s += n;
04695 }
04696 }
04697 }
04698
04699 if (modify) return str;
04700 return Qnil;
04701 }
04702
04703
04704
04705
04706
04707
04708
04709
04710
04711
04712
04713
04714
04715
04716 static VALUE
04717 rb_str_upcase(VALUE str)
04718 {
04719 str = rb_str_dup(str);
04720 rb_str_upcase_bang(str);
04721 return str;
04722 }
04723
04724
04725
04726
04727
04728
04729
04730
04731
04732
04733
04734 static VALUE
04735 rb_str_downcase_bang(VALUE str)
04736 {
04737 rb_encoding *enc;
04738 char *s, *send;
04739 int modify = 0;
04740
04741 str_modify_keep_cr(str);
04742 enc = STR_ENC_GET(str);
04743 rb_str_check_dummy_enc(enc);
04744 s = RSTRING_PTR(str); send = RSTRING_END(str);
04745 if (single_byte_optimizable(str)) {
04746 while (s < send) {
04747 unsigned int c = *(unsigned char*)s;
04748
04749 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04750 *s = 'a' + (c - 'A');
04751 modify = 1;
04752 }
04753 s++;
04754 }
04755 }
04756 else {
04757 int ascompat = rb_enc_asciicompat(enc);
04758
04759 while (s < send) {
04760 unsigned int c;
04761 int n;
04762
04763 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
04764 if (rb_enc_isascii(c, enc) && 'A' <= c && c <= 'Z') {
04765 *s = 'a' + (c - 'A');
04766 modify = 1;
04767 }
04768 s++;
04769 }
04770 else {
04771 c = rb_enc_codepoint_len(s, send, &n, enc);
04772 if (rb_enc_isupper(c, enc)) {
04773
04774 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04775 modify = 1;
04776 }
04777 s += n;
04778 }
04779 }
04780 }
04781
04782 if (modify) return str;
04783 return Qnil;
04784 }
04785
04786
04787
04788
04789
04790
04791
04792
04793
04794
04795
04796
04797
04798
04799 static VALUE
04800 rb_str_downcase(VALUE str)
04801 {
04802 str = rb_str_dup(str);
04803 rb_str_downcase_bang(str);
04804 return str;
04805 }
04806
04807
04808
04809
04810
04811
04812
04813
04814
04815
04816
04817
04818
04819
04820
04821
04822 static VALUE
04823 rb_str_capitalize_bang(VALUE str)
04824 {
04825 rb_encoding *enc;
04826 char *s, *send;
04827 int modify = 0;
04828 unsigned int c;
04829 int n;
04830
04831 str_modify_keep_cr(str);
04832 enc = STR_ENC_GET(str);
04833 rb_str_check_dummy_enc(enc);
04834 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
04835 s = RSTRING_PTR(str); send = RSTRING_END(str);
04836
04837 c = rb_enc_codepoint_len(s, send, &n, enc);
04838 if (rb_enc_islower(c, enc)) {
04839 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04840 modify = 1;
04841 }
04842 s += n;
04843 while (s < send) {
04844 c = rb_enc_codepoint_len(s, send, &n, enc);
04845 if (rb_enc_isupper(c, enc)) {
04846 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04847 modify = 1;
04848 }
04849 s += n;
04850 }
04851
04852 if (modify) return str;
04853 return Qnil;
04854 }
04855
04856
04857
04858
04859
04860
04861
04862
04863
04864
04865
04866
04867
04868
04869
04870 static VALUE
04871 rb_str_capitalize(VALUE str)
04872 {
04873 str = rb_str_dup(str);
04874 rb_str_capitalize_bang(str);
04875 return str;
04876 }
04877
04878
04879
04880
04881
04882
04883
04884
04885
04886
04887
04888 static VALUE
04889 rb_str_swapcase_bang(VALUE str)
04890 {
04891 rb_encoding *enc;
04892 char *s, *send;
04893 int modify = 0;
04894 int n;
04895
04896 str_modify_keep_cr(str);
04897 enc = STR_ENC_GET(str);
04898 rb_str_check_dummy_enc(enc);
04899 s = RSTRING_PTR(str); send = RSTRING_END(str);
04900 while (s < send) {
04901 unsigned int c = rb_enc_codepoint_len(s, send, &n, enc);
04902
04903 if (rb_enc_isupper(c, enc)) {
04904
04905 rb_enc_mbcput(rb_enc_tolower(c, enc), s, enc);
04906 modify = 1;
04907 }
04908 else if (rb_enc_islower(c, enc)) {
04909
04910 rb_enc_mbcput(rb_enc_toupper(c, enc), s, enc);
04911 modify = 1;
04912 }
04913 s += n;
04914 }
04915
04916 if (modify) return str;
04917 return Qnil;
04918 }
04919
04920
04921
04922
04923
04924
04925
04926
04927
04928
04929
04930
04931
04932
04933 static VALUE
04934 rb_str_swapcase(VALUE str)
04935 {
04936 str = rb_str_dup(str);
04937 rb_str_swapcase_bang(str);
04938 return str;
04939 }
04940
typedef unsigned char *USTR;

/*
 * Cursor over a tr-style character specification, advanced by trnext().
 */
struct tr {
    int gen;            /* non-zero while a range such as "a-z" is being expanded */
    unsigned int now;   /* codepoint most recently produced */
    unsigned int max;   /* last codepoint of the range being expanded */
    char *p;            /* next unread byte of the specification */
    char *pend;         /* one past the last byte of the specification */
};
04948
04949 static unsigned int
04950 trnext(struct tr *t, rb_encoding *enc)
04951 {
04952 int n;
04953
04954 for (;;) {
04955 if (!t->gen) {
04956 if (t->p == t->pend) return -1;
04957 if (t->p < t->pend - 1 && *t->p == '\\') {
04958 t->p++;
04959 }
04960 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
04961 t->p += n;
04962 if (t->p < t->pend - 1 && *t->p == '-') {
04963 t->p++;
04964 if (t->p < t->pend) {
04965 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
04966 t->p += n;
04967 if (t->now > c) {
04968 if (t->now < 0x80 && c < 0x80) {
04969 rb_raise(rb_eArgError,
04970 "invalid range \"%c-%c\" in string transliteration",
04971 t->now, c);
04972 }
04973 else {
04974 rb_raise(rb_eArgError, "invalid range in string transliteration");
04975 }
04976 continue;
04977 }
04978 t->gen = 1;
04979 t->max = c;
04980 }
04981 }
04982 return t->now;
04983 }
04984 else if (++t->now < t->max) {
04985 return t->now;
04986 }
04987 else {
04988 t->gen = 0;
04989 return t->max;
04990 }
04991 }
04992 }
04993
04994 static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
04995
04996 static VALUE
04997 tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
04998 {
04999 const unsigned int errc = -1;
05000 unsigned int trans[256];
05001 rb_encoding *enc, *e1, *e2;
05002 struct tr trsrc, trrepl;
05003 int cflag = 0;
05004 unsigned int c, c0, last = 0;
05005 int modify = 0, i, l;
05006 char *s, *send;
05007 VALUE hash = 0;
05008 int singlebyte = single_byte_optimizable(str);
05009 int cr;
05010
05011 #define CHECK_IF_ASCII(c) \
05012 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
05013 (cr = ENC_CODERANGE_VALID) : 0)
05014
05015 StringValue(src);
05016 StringValue(repl);
05017 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
05018 if (RSTRING_LEN(repl) == 0) {
05019 return rb_str_delete_bang(1, &src, str);
05020 }
05021
05022 cr = ENC_CODERANGE(str);
05023 e1 = rb_enc_check(str, src);
05024 e2 = rb_enc_check(str, repl);
05025 if (e1 == e2) {
05026 enc = e1;
05027 }
05028 else {
05029 enc = rb_enc_check(src, repl);
05030 }
05031 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
05032 if (RSTRING_LEN(src) > 1 &&
05033 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
05034 trsrc.p + l < trsrc.pend) {
05035 cflag = 1;
05036 trsrc.p += l;
05037 }
05038 trrepl.p = RSTRING_PTR(repl);
05039 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
05040 trsrc.gen = trrepl.gen = 0;
05041 trsrc.now = trrepl.now = 0;
05042 trsrc.max = trrepl.max = 0;
05043
05044 if (cflag) {
05045 for (i=0; i<256; i++) {
05046 trans[i] = 1;
05047 }
05048 while ((c = trnext(&trsrc, enc)) != errc) {
05049 if (c < 256) {
05050 trans[c] = errc;
05051 }
05052 else {
05053 if (!hash) hash = rb_hash_new();
05054 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
05055 }
05056 }
05057 while ((c = trnext(&trrepl, enc)) != errc)
05058 ;
05059 last = trrepl.now;
05060 for (i=0; i<256; i++) {
05061 if (trans[i] != errc) {
05062 trans[i] = last;
05063 }
05064 }
05065 }
05066 else {
05067 unsigned int r;
05068
05069 for (i=0; i<256; i++) {
05070 trans[i] = errc;
05071 }
05072 while ((c = trnext(&trsrc, enc)) != errc) {
05073 r = trnext(&trrepl, enc);
05074 if (r == errc) r = trrepl.now;
05075 if (c < 256) {
05076 trans[c] = r;
05077 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
05078 }
05079 else {
05080 if (!hash) hash = rb_hash_new();
05081 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
05082 }
05083 }
05084 }
05085
05086 if (cr == ENC_CODERANGE_VALID)
05087 cr = ENC_CODERANGE_7BIT;
05088 str_modify_keep_cr(str);
05089 s = RSTRING_PTR(str); send = RSTRING_END(str);
05090 if (sflag) {
05091 int clen, tlen;
05092 long offset, max = RSTRING_LEN(str);
05093 unsigned int save = -1;
05094 char *buf = ALLOC_N(char, max), *t = buf;
05095
05096 while (s < send) {
05097 int may_modify = 0;
05098
05099 c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
05100 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
05101
05102 s += clen;
05103 if (c < 256) {
05104 c = trans[c];
05105 }
05106 else if (hash) {
05107 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
05108 if (NIL_P(tmp)) {
05109 if (cflag) c = last;
05110 else c = errc;
05111 }
05112 else if (cflag) c = errc;
05113 else c = NUM2INT(tmp);
05114 }
05115 else {
05116 c = errc;
05117 }
05118 if (c != (unsigned int)-1) {
05119 if (save == c) {
05120 CHECK_IF_ASCII(c);
05121 continue;
05122 }
05123 save = c;
05124 tlen = rb_enc_codelen(c, enc);
05125 modify = 1;
05126 }
05127 else {
05128 save = -1;
05129 c = c0;
05130 if (enc != e1) may_modify = 1;
05131 }
05132 while (t - buf + tlen >= max) {
05133 offset = t - buf;
05134 max *= 2;
05135 REALLOC_N(buf, char, max);
05136 t = buf + offset;
05137 }
05138 rb_enc_mbcput(c, t, enc);
05139 if (may_modify && memcmp(s, t, tlen) != 0) {
05140 modify = 1;
05141 }
05142 CHECK_IF_ASCII(c);
05143 t += tlen;
05144 }
05145 if (!STR_EMBED_P(str)) {
05146 xfree(RSTRING(str)->as.heap.ptr);
05147 }
05148 *t = '\0';
05149 RSTRING(str)->as.heap.ptr = buf;
05150 RSTRING(str)->as.heap.len = t - buf;
05151 STR_SET_NOEMBED(str);
05152 RSTRING(str)->as.heap.aux.capa = max;
05153 }
05154 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
05155 while (s < send) {
05156 c = (unsigned char)*s;
05157 if (trans[c] != errc) {
05158 if (!cflag) {
05159 c = trans[c];
05160 *s = c;
05161 modify = 1;
05162 }
05163 else {
05164 *s = last;
05165 modify = 1;
05166 }
05167 }
05168 CHECK_IF_ASCII(c);
05169 s++;
05170 }
05171 }
05172 else {
05173 int clen, tlen, max = (int)(RSTRING_LEN(str) * 1.2);
05174 long offset;
05175 char *buf = ALLOC_N(char, max), *t = buf;
05176
05177 while (s < send) {
05178 int may_modify = 0;
05179 c0 = c = rb_enc_codepoint_len(s, send, &clen, e1);
05180 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
05181
05182 if (c < 256) {
05183 c = trans[c];
05184 }
05185 else if (hash) {
05186 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
05187 if (NIL_P(tmp)) {
05188 if (cflag) c = last;
05189 else c = errc;
05190 }
05191 else if (cflag) c = errc;
05192 else c = NUM2INT(tmp);
05193 }
05194 else {
05195 c = cflag ? last : errc;
05196 }
05197 if (c != errc) {
05198 tlen = rb_enc_codelen(c, enc);
05199 modify = 1;
05200 }
05201 else {
05202 c = c0;
05203 if (enc != e1) may_modify = 1;
05204 }
05205 while (t - buf + tlen >= max) {
05206 offset = t - buf;
05207 max *= 2;
05208 REALLOC_N(buf, char, max);
05209 t = buf + offset;
05210 }
05211 if (s != t) {
05212 rb_enc_mbcput(c, t, enc);
05213 if (may_modify && memcmp(s, t, tlen) != 0) {
05214 modify = 1;
05215 }
05216 }
05217 CHECK_IF_ASCII(c);
05218 s += clen;
05219 t += tlen;
05220 }
05221 if (!STR_EMBED_P(str)) {
05222 xfree(RSTRING(str)->as.heap.ptr);
05223 }
05224 *t = '\0';
05225 RSTRING(str)->as.heap.ptr = buf;
05226 RSTRING(str)->as.heap.len = t - buf;
05227 STR_SET_NOEMBED(str);
05228 RSTRING(str)->as.heap.aux.capa = max;
05229 }
05230
05231 if (modify) {
05232 if (cr != ENC_CODERANGE_BROKEN)
05233 ENC_CODERANGE_SET(str, cr);
05234 rb_enc_associate(str, enc);
05235 return str;
05236 }
05237 return Qnil;
05238 }
05239
05240
05241
05242
05243
05244
05245
05246
05247
05248
05249
05250 static VALUE
05251 rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
05252 {
05253 return tr_trans(str, src, repl, 0);
05254 }
05255
05256
05257
05258
05259
05260
05261
05262
05263
05264
05265
05266
05267
05268
05269
05270
05271
05272
05273
05274
05275
05276
05277 static VALUE
05278 rb_str_tr(VALUE str, VALUE src, VALUE repl)
05279 {
05280 str = rb_str_dup(str);
05281 tr_trans(str, src, repl, 0);
05282 return str;
05283 }
05284
05285 #define TR_TABLE_SIZE 257
05286 static void
05287 tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
05288 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
05289 {
05290 const unsigned int errc = -1;
05291 char buf[256];
05292 struct tr tr;
05293 unsigned int c;
05294 VALUE table = 0, ptable = 0;
05295 int i, l, cflag = 0;
05296
05297 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
05298 tr.gen = tr.now = tr.max = 0;
05299
05300 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
05301 cflag = 1;
05302 tr.p += l;
05303 }
05304 if (first) {
05305 for (i=0; i<256; i++) {
05306 stable[i] = 1;
05307 }
05308 stable[256] = cflag;
05309 }
05310 else if (stable[256] && !cflag) {
05311 stable[256] = 0;
05312 }
05313 for (i=0; i<256; i++) {
05314 buf[i] = cflag;
05315 }
05316
05317 while ((c = trnext(&tr, enc)) != errc) {
05318 if (c < 256) {
05319 buf[c & 0xff] = !cflag;
05320 }
05321 else {
05322 VALUE key = UINT2NUM(c);
05323
05324 if (!table) {
05325 table = rb_hash_new();
05326 if (cflag) {
05327 ptable = *ctablep;
05328 *ctablep = table;
05329 }
05330 else {
05331 ptable = *tablep;
05332 *tablep = table;
05333 }
05334 }
05335 if (!ptable || !NIL_P(rb_hash_aref(ptable, key))) {
05336 rb_hash_aset(table, key, Qtrue);
05337 }
05338 }
05339 }
05340 for (i=0; i<256; i++) {
05341 stable[i] = stable[i] && buf[i];
05342 }
05343 }
05344
05345
05346 static int
05347 tr_find(unsigned int c, char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
05348 {
05349 if (c < 256) {
05350 return table[c] != 0;
05351 }
05352 else {
05353 VALUE v = UINT2NUM(c);
05354
05355 if (del) {
05356 if (!NIL_P(rb_hash_lookup(del, v)) &&
05357 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
05358 return TRUE;
05359 }
05360 }
05361 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
05362 return FALSE;
05363 }
05364 return table[256] ? TRUE : FALSE;
05365 }
05366 }
05367
05368
05369
05370
05371
05372
05373
05374
05375
05376 static VALUE
05377 rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
05378 {
05379 char squeez[TR_TABLE_SIZE];
05380 rb_encoding *enc = 0;
05381 char *s, *send, *t;
05382 VALUE del = 0, nodel = 0;
05383 int modify = 0;
05384 int i, ascompat, cr;
05385
05386 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
05387 if (argc < 1) {
05388 rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
05389 }
05390 for (i=0; i<argc; i++) {
05391 VALUE s = argv[i];
05392
05393 StringValue(s);
05394 enc = rb_enc_check(str, s);
05395 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05396 }
05397
05398 str_modify_keep_cr(str);
05399 ascompat = rb_enc_asciicompat(enc);
05400 s = t = RSTRING_PTR(str);
05401 send = RSTRING_END(str);
05402 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
05403 while (s < send) {
05404 unsigned int c;
05405 int clen;
05406
05407 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05408 if (squeez[c]) {
05409 modify = 1;
05410 }
05411 else {
05412 if (t != s) *t = c;
05413 t++;
05414 }
05415 s++;
05416 }
05417 else {
05418 c = rb_enc_codepoint_len(s, send, &clen, enc);
05419
05420 if (tr_find(c, squeez, del, nodel)) {
05421 modify = 1;
05422 }
05423 else {
05424 if (t != s) rb_enc_mbcput(c, t, enc);
05425 t += clen;
05426 if (cr == ENC_CODERANGE_7BIT) cr = ENC_CODERANGE_VALID;
05427 }
05428 s += clen;
05429 }
05430 }
05431 *t = '\0';
05432 STR_SET_LEN(str, t - RSTRING_PTR(str));
05433 ENC_CODERANGE_SET(str, cr);
05434
05435 if (modify) return str;
05436 return Qnil;
05437 }
05438
05439
05440
05441
05442
05443
05444
05445
05446
05447
05448
05449
05450
05451
05452
05453
05454 static VALUE
05455 rb_str_delete(int argc, VALUE *argv, VALUE str)
05456 {
05457 str = rb_str_dup(str);
05458 rb_str_delete_bang(argc, argv, str);
05459 return str;
05460 }
05461
05462
05463
05464
05465
05466
05467
05468
05469
05470
05471 static VALUE
05472 rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
05473 {
05474 char squeez[TR_TABLE_SIZE];
05475 rb_encoding *enc = 0;
05476 VALUE del = 0, nodel = 0;
05477 char *s, *send, *t;
05478 int i, modify = 0;
05479 int ascompat, singlebyte = single_byte_optimizable(str);
05480 unsigned int save;
05481
05482 if (argc == 0) {
05483 enc = STR_ENC_GET(str);
05484 }
05485 else {
05486 for (i=0; i<argc; i++) {
05487 VALUE s = argv[i];
05488
05489 StringValue(s);
05490 enc = rb_enc_check(str, s);
05491 if (singlebyte && !single_byte_optimizable(s))
05492 singlebyte = 0;
05493 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
05494 }
05495 }
05496
05497 str_modify_keep_cr(str);
05498 s = t = RSTRING_PTR(str);
05499 if (!s || RSTRING_LEN(str) == 0) return Qnil;
05500 send = RSTRING_END(str);
05501 save = -1;
05502 ascompat = rb_enc_asciicompat(enc);
05503
05504 if (singlebyte) {
05505 while (s < send) {
05506 unsigned int c = *(unsigned char*)s++;
05507 if (c != save || (argc > 0 && !squeez[c])) {
05508 *t++ = save = c;
05509 }
05510 }
05511 } else {
05512 while (s < send) {
05513 unsigned int c;
05514 int clen;
05515
05516 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05517 if (c != save || (argc > 0 && !squeez[c])) {
05518 *t++ = save = c;
05519 }
05520 s++;
05521 }
05522 else {
05523 c = rb_enc_codepoint_len(s, send, &clen, enc);
05524
05525 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
05526 if (t != s) rb_enc_mbcput(c, t, enc);
05527 save = c;
05528 t += clen;
05529 }
05530 s += clen;
05531 }
05532 }
05533 }
05534
05535 *t = '\0';
05536 if (t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
05537 STR_SET_LEN(str, t - RSTRING_PTR(str));
05538 modify = 1;
05539 }
05540
05541 if (modify) return str;
05542 return Qnil;
05543 }
05544
05545
05546
05547
05548
05549
05550
05551
05552
05553
05554
05555
05556
05557
05558
05559
05560
05561 static VALUE
05562 rb_str_squeeze(int argc, VALUE *argv, VALUE str)
05563 {
05564 str = rb_str_dup(str);
05565 rb_str_squeeze_bang(argc, argv, str);
05566 return str;
05567 }
05568
05569
05570
05571
05572
05573
05574
05575
05576
05577
05578 static VALUE
05579 rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
05580 {
05581 return tr_trans(str, src, repl, 1);
05582 }
05583
05584
05585
05586
05587
05588
05589
05590
05591
05592
05593
05594
05595
05596
05597
05598 static VALUE
05599 rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
05600 {
05601 str = rb_str_dup(str);
05602 tr_trans(str, src, repl, 1);
05603 return str;
05604 }
05605
05606
05607
05608
05609
05610
05611
05612
05613
05614
05615
05616
05617
05618
05619
05620
05621
05622
05623 static VALUE
05624 rb_str_count(int argc, VALUE *argv, VALUE str)
05625 {
05626 char table[TR_TABLE_SIZE];
05627 rb_encoding *enc = 0;
05628 VALUE del = 0, nodel = 0;
05629 char *s, *send;
05630 int i;
05631 int ascompat;
05632
05633 if (argc < 1) {
05634 rb_raise(rb_eArgError, "wrong number of arguments (at least 1)");
05635 }
05636 for (i=0; i<argc; i++) {
05637 VALUE tstr = argv[i];
05638 unsigned char c;
05639
05640 StringValue(tstr);
05641 enc = rb_enc_check(str, tstr);
05642 if (argc == 1 && RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
05643 (c = RSTRING_PTR(tstr)[0]) < 0x80 && !is_broken_string(str)) {
05644 int n = 0;
05645
05646 s = RSTRING_PTR(str);
05647 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05648 send = RSTRING_END(str);
05649 while (s < send) {
05650 if (*(unsigned char*)s++ == c) n++;
05651 }
05652 return INT2NUM(n);
05653 }
05654 tr_setup_table(tstr, table, i==0, &del, &nodel, enc);
05655 }
05656
05657 s = RSTRING_PTR(str);
05658 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
05659 send = RSTRING_END(str);
05660 ascompat = rb_enc_asciicompat(enc);
05661 i = 0;
05662 while (s < send) {
05663 unsigned int c;
05664
05665 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
05666 if (table[c]) {
05667 i++;
05668 }
05669 s++;
05670 }
05671 else {
05672 int clen;
05673 c = rb_enc_codepoint_len(s, send, &clen, enc);
05674 if (tr_find(c, table, del, nodel)) {
05675 i++;
05676 }
05677 s += clen;
05678 }
05679 }
05680
05681 return INT2NUM(i);
05682 }
05683
/*
 * Lookup table indexed by byte value: 1 for the bytes 0x09..0x0D and 0x20,
 * 0 for every other byte.  Each row below covers sixteen consecutive
 * values.
 */
static const char isspacetable[256] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xD0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xE0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xF0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};

/* Table lookup; the cast keeps a negative char from indexing out of range. */
#define ascii_isspace(c) (isspacetable[(unsigned char)(c)])
05704
05705
05706
05707
05708
05709
05710
05711
05712
05713
05714
05715
05716
05717
05718
05719
05720
05721
05722
05723
05724
05725
05726
05727
05728
05729
05730
05731
05732
05733
05734
05735
05736
05737
05738
05739
05740
05741
05742
05743
05744
05745
05746
/*
 * Splits +str+ into an array of substrings (method-style entry point that
 * receives its arguments as argc/argv).
 *
 * Arguments (both optional, scanned with "02"):
 *   argv[0]  separator pattern; when nil, the global rb_fs is used if it is
 *            set, otherwise whitespace ("awk") splitting is selected.
 *   argv[1]  limit, converted with NUM2INT.
 *
 * Separator handling:
 *   - empty String: compiled to a regexp via rb_reg_regcomp;
 *   - String consisting of a single space: awk mode;
 *   - any other String: literal search with rb_memsearch;
 *   - anything else: converted to a regexp by get_pat.
 *
 * Limit handling:
 *   - lim <= 0: no cap on the number of pieces;
 *   - lim == 1: returns [str] (or [] for an empty str) without splitting;
 *   - trailing empty strings are removed only when lim == 0 (limit omitted
 *     or given as 0).
 *
 * Raises ArgumentError in literal-string mode when str or the separator
 * contains an invalid byte sequence.
 */
05747 static VALUE
05748 rb_str_split_m(int argc, VALUE *argv, VALUE str)
05749 {
05750 rb_encoding *enc;
05751 VALUE spat;
05752 VALUE limit;
05753 enum {awk, string, regexp} split_type;
05754 long beg, end, i = 0;
05755 int lim = 0;
05756 VALUE result, tmp;
05757
05758 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
05759 lim = NUM2INT(limit);
/* A non-positive limit disables the piece counter (limit becomes nil). */
05760 if (lim <= 0) limit = Qnil;
05761 else if (lim == 1) {
05762 if (RSTRING_LEN(str) == 0)
05763 return rb_ary_new2(0);
05764 return rb_ary_new3(1, str);
05765 }
/* i counts pieces; it starts at 1 so the final remainder is included. */
05766 i = 1;
05767 }
05768
05769 enc = STR_ENC_GET(str);
05770 if (NIL_P(spat)) {
05771 if (!NIL_P(rb_fs)) {
05772 spat = rb_fs;
05773 goto fs_set;
05774 }
05775 split_type = awk;
05776 }
05777 else {
05778 fs_set:
05779 if (TYPE(spat) == T_STRING) {
05780 rb_encoding *enc2 = STR_ENC_GET(spat);
05781
05782 split_type = string;
05783 if (RSTRING_LEN(spat) == 0) {
05784
/* Empty separator: handled by the regexp branch below. */
05785 spat = rb_reg_regcomp(spat);
05786 split_type = regexp;
05787 }
05788 else if (rb_enc_asciicompat(enc2) == 1) {
05789 if (RSTRING_LEN(spat) == 1 && RSTRING_PTR(spat)[0] == ' '){
05790 split_type = awk;
05791 }
05792 }
05793 else {
/* ASCII-incompatible separator encoding: decode the first character to
 * see whether the whole separator is a single space. */
05794 int l;
05795 if (rb_enc_ascget(RSTRING_PTR(spat), RSTRING_END(spat), &l, enc2) == ' ' &&
05796 RSTRING_LEN(spat) == l) {
05797 split_type = awk;
05798 }
05799 }
05800 }
05801 else {
05802 spat = get_pat(spat, 1);
05803 split_type = regexp;
05804 }
05805 }
05806
05807 result = rb_ary_new();
05808 beg = 0;
05809 if (split_type == awk) {
/* awk mode: fields are separated by runs of whitespace and leading
 * whitespace is skipped.  skip != 0 means "between fields"; beg/end are
 * byte offsets of the current field. */
05810 char *ptr = RSTRING_PTR(str);
05811 char *eptr = RSTRING_END(str);
05812 char *bptr = ptr;
05813 int skip = 1;
05814 unsigned int c;
05815
05816 end = beg;
05817 if (is_ascii_string(str)) {
/* Byte-at-a-time path using the ascii_isspace table. */
05818 while (ptr < eptr) {
05819 c = (unsigned char)*ptr++;
05820 if (skip) {
05821 if (ascii_isspace(c)) {
05822 beg = ptr - bptr;
05823 }
05824 else {
05825 end = ptr - bptr;
05826 skip = 0;
/* Limit reached: stop here so the rest of str becomes the last piece. */
05827 if (!NIL_P(limit) && lim <= i) break;
05828 }
05829 }
05830 else if (ascii_isspace(c)) {
05831 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05832 skip = 1;
05833 beg = ptr - bptr;
05834 if (!NIL_P(limit)) ++i;
05835 }
05836 else {
05837 end = ptr - bptr;
05838 }
05839 }
05840 }
05841 else {
/* Same state machine, but decoding one character at a time with
 * rb_enc_codepoint_len and testing it with rb_isspace. */
05842 while (ptr < eptr) {
05843 int n;
05844
05845 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
05846 ptr += n;
05847 if (skip) {
05848 if (rb_isspace(c)) {
05849 beg = ptr - bptr;
05850 }
05851 else {
05852 end = ptr - bptr;
05853 skip = 0;
05854 if (!NIL_P(limit) && lim <= i) break;
05855 }
05856 }
05857 else if (rb_isspace(c)) {
05858 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05859 skip = 1;
05860 beg = ptr - bptr;
05861 if (!NIL_P(limit)) ++i;
05862 }
05863 else {
05864 end = ptr - bptr;
05865 }
05866 }
05867 }
05868 }
05869 else if (split_type == string) {
/* Literal separator: repeated byte search from ptr; temp keeps the start
 * of the buffer so byte offsets can be computed. */
05870 char *ptr = RSTRING_PTR(str);
05871 char *temp = ptr;
05872 char *eptr = RSTRING_END(str);
05873 char *sptr = RSTRING_PTR(spat);
05874 long slen = RSTRING_LEN(spat);
05875
05876 if (is_broken_string(str)) {
05877 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
05878 }
05879 if (is_broken_string(spat)) {
05880 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(spat)));
05881 }
05882 enc = rb_enc_check(str, spat);
05883 while (ptr < eptr &&
05884 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
05885
/* Ignore a byte match that does not start on a character boundary and
 * resume the search from the next character head. */
05886 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
05887 if (t != ptr + end) {
05888 ptr = t;
05889 continue;
05890 }
05891 rb_ary_push(result, rb_str_subseq(str, ptr - temp, end));
05892 ptr += end + slen;
05893 if (!NIL_P(limit) && lim <= ++i) break;
05894 }
05895 beg = ptr - temp;
05896 }
05897 else {
/* Regexp separator.  start is where the next search begins, beg is where
 * the pending piece begins; last_null records that the previous match was
 * empty, so that an empty pattern splits into single characters. */
05898 char *ptr = RSTRING_PTR(str);
05899 long len = RSTRING_LEN(str);
05900 long start = beg;
05901 long idx;
05902 int last_null = 0;
05903 struct re_registers *regs;
05904
05905 while ((end = rb_reg_search(spat, str, start, 0)) >= 0) {
05906 regs = RMATCH_REGS(rb_backref_get());
05907 if (start == end && BEG(0) == END(0)) {
05908 if (!ptr) {
05909 rb_ary_push(result, str_new_empty(str));
05910 break;
05911 }
05912 else if (last_null == 1) {
/* Second empty match in a row: emit the single character at beg. */
05913 rb_ary_push(result, rb_str_subseq(str, beg,
05914 rb_enc_fast_mbclen(ptr+beg,
05915 ptr+len,
05916 enc)));
05917 beg = start;
05918 }
05919 else {
/* First empty match: advance the search position by one character (one
 * byte at end of string) and retry without emitting anything. */
05920 if (ptr+start == ptr+len)
05921 start++;
05922 else
05923 start += rb_enc_fast_mbclen(ptr+start,ptr+len,enc);
05924 last_null = 1;
05925 continue;
05926 }
05927 }
05928 else {
05929 rb_ary_push(result, rb_str_subseq(str, beg, end-beg));
05930 beg = start = END(0);
05931 }
05932 last_null = 0;
05933
/* Capture groups that participated in the match are appended after the
 * piece; groups with BEG == -1 are skipped. */
05934 for (idx=1; idx < regs->num_regs; idx++) {
05935 if (BEG(idx) == -1) continue;
05936 if (BEG(idx) == END(idx))
05937 tmp = str_new_empty(str);
05938 else
05939 tmp = rb_str_subseq(str, BEG(idx), END(idx)-BEG(idx));
05940 rb_ary_push(result, tmp);
05941 }
05942 if (!NIL_P(limit) && lim <= ++i) break;
05943 }
05944 }
/* Append the remainder starting at beg: always when a positive limit is
 * active or lim is negative, otherwise only if unsplit bytes remain. */
05945 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
05946 if (RSTRING_LEN(str) == beg)
05947 tmp = str_new_empty(str);
05948 else
05949 tmp = rb_str_subseq(str, beg, RSTRING_LEN(str)-beg);
05950 rb_ary_push(result, tmp);
05951 }
/* With no limit (or limit 0) trailing empty strings are removed. */
05952 if (NIL_P(limit) && lim == 0) {
05953 long len;
05954 while ((len = RARRAY_LEN(result)) > 0 &&
05955 (tmp = RARRAY_PTR(result)[len-1], RSTRING_LEN(tmp) == 0))
05956 rb_ary_pop(result);
05957 }
05958
05959 return result;
05960 }
05961
05962 VALUE
05963 rb_str_split(VALUE str, const char *sep0)
05964 {
05965 VALUE sep;
05966
05967 StringValue(str);
05968 sep = rb_str_new2(sep0);
05969 return rb_str_split_m(1, &sep, str);
05970 }
05971
05972
05973
05974
05975
05976
05977
05978
05979
05980
05981
05982
05983
05984
05985
05986
05987
05988
05989
05990
05991
05992
05993
05994
05995
05996
05997
05998
05999
06000
06001
06002
06003
06004
06005
06006
06007
06008
06009
/*
 * Yields successive lines of +str+ to the block, each line including its
 * terminating separator, and returns the original receiver.
 *
 * argv[0] (optional) is the record separator; without arguments the global
 * rb_rs is used.  Returns an enumerator when no block is given
 * (RETURN_ENUMERATOR).
 *
 *   - nil separator: the whole string is yielded once;
 *   - separator identical to rb_default_rs: fast path searching for '\n'
 *     bytes with memchr;
 *   - empty separator: "paragraph" mode, where the terminator is a run of
 *     newlines and a lone newline followed by other text does not end a line;
 *   - any other separator: lines end where the separator bytes occur.
 *
 * Any text after the last separator is yielded as a final line.
 */
06010 static VALUE
06011 rb_str_each_line(int argc, VALUE *argv, VALUE str)
06012 {
06013 rb_encoding *enc;
06014 VALUE rs;
06015 unsigned int newline;
06016 const char *p, *pend, *s, *ptr;
06017 long len, rslen;
06018 VALUE line;
06019 int n;
06020 VALUE orig = str;
06021
06022 if (argc == 0) {
06023 rs = rb_rs;
06024 }
06025 else {
06026 rb_scan_args(argc, argv, "01", &rs);
06027 }
06028 RETURN_ENUMERATOR(str, argc, argv);
06029 if (NIL_P(rs)) {
06030 rb_yield(str);
06031 return orig;
06032 }
/* Iterate over the result of rb_str_new4() rather than the receiver.
 * NOTE(review): presumably a frozen snapshot so the block cannot disturb
 * the iteration -- confirm against rb_str_new4. */
06033 str = rb_str_new4(str);
/* s = start of the current line, p = scan position, ptr/len = values
 * handed to str_mod_check() after every yield. */
06034 ptr = p = s = RSTRING_PTR(str);
06035 pend = p + RSTRING_LEN(str);
06036 len = RSTRING_LEN(str);
06037 StringValue(rs);
06038 if (rs == rb_default_rs) {
06039 enc = rb_enc_get(str);
06040 while (p < pend) {
06041 char *p0;
06042
06043 p = memchr(p, '\n', pend - p);
06044 if (!p) break;
/* Make sure the '\n' byte really is a newline character in this encoding
 * and not part of a multibyte character. */
06045 p0 = rb_enc_left_char_head(s, p, pend, enc);
06046 if (!rb_enc_is_newline(p0, pend, enc)) {
06047 p++;
06048 continue;
06049 }
06050 p = p0 + rb_enc_mbclen(p0, pend, enc);
06051 line = rb_str_new5(str, s, p - s);
06052 OBJ_INFECT(line, str);
06053 rb_enc_cr_str_copy_for_substr(line, str);
06054 rb_yield(line);
06055 str_mod_check(str, ptr, len);
06056 s = p;
06057 }
06058 goto finish;
06059 }
06060
06061 enc = rb_enc_check(str, rs);
06062 rslen = RSTRING_LEN(rs);
/* newline is the code point that triggers a separator check: '\n' in
 * paragraph mode, otherwise the first character of rs. */
06063 if (rslen == 0) {
06064 newline = '\n';
06065 }
06066 else {
06067 newline = rb_enc_codepoint(RSTRING_PTR(rs), RSTRING_END(rs), enc);
06068 }
06069
06070 while (p < pend) {
06071 unsigned int c = rb_enc_codepoint_len(p, pend, &n, enc);
06072
06073 again:
06074 if (rslen == 0 && c == newline) {
/* Paragraph mode: look at the character after this newline.  If it is not
 * a newline, re-examine it as an ordinary character; otherwise skip the
 * whole run of newlines and step back onto its last one. */
06075 p += n;
06076 if (p < pend && (c = rb_enc_codepoint_len(p, pend, &n, enc)) != newline) {
06077 goto again;
06078 }
06079 while (p < pend && rb_enc_codepoint(p, pend, enc) == newline) {
06080 p += n;
06081 }
06082 p -= n;
06083 }
/* A line ends here if the separator is at most one byte long, or the
 * remaining bytes start with the full separator. */
06084 if (c == newline &&
06085 (rslen <= 1 ||
06086 (pend - p >= rslen && memcmp(RSTRING_PTR(rs), p, rslen) == 0))) {
06087 line = rb_str_new5(str, s, p - s + (rslen ? rslen : n));
06088 OBJ_INFECT(line, str);
06089 rb_enc_cr_str_copy_for_substr(line, str);
06090 rb_yield(line);
06091 str_mod_check(str, ptr, len);
06092 s = p + (rslen ? rslen : n);
06093 }
06094 p += n;
06095 }
06096
06097 finish:
/* Yield whatever follows the last separator. */
06098 if (s != pend) {
06099 line = rb_str_new5(str, s, pend - s);
06100 OBJ_INFECT(line, str);
06101 rb_enc_cr_str_copy_for_substr(line, str);
06102 rb_yield(line);
06103 }
06104
06105 return orig;
06106 }
06107
06108
06109
06110
06111
06112
06113
06114
06115
06116
06117
06118
06119
06120
06121
06122
06123
06124
06125
06126
06127 static VALUE
06128 rb_str_each_byte(VALUE str)
06129 {
06130 long i;
06131
06132 RETURN_ENUMERATOR(str, 0, 0);
06133 for (i=0; i<RSTRING_LEN(str); i++) {
06134 rb_yield(INT2FIX(RSTRING_PTR(str)[i] & 0xff));
06135 }
06136 return str;
06137 }
06138
06139
06140
06141
06142
06143
06144
06145
06146
06147
06148
06149
06150
06151
06152
06153
06154
06155
06156
06157
06158 static VALUE
06159 rb_str_each_char(VALUE str)
06160 {
06161 VALUE orig = str;
06162 long i, len, n;
06163 const char *ptr;
06164 rb_encoding *enc;
06165
06166 RETURN_ENUMERATOR(str, 0, 0);
06167 str = rb_str_new4(str);
06168 ptr = RSTRING_PTR(str);
06169 len = RSTRING_LEN(str);
06170 enc = rb_enc_get(str);
06171 switch (ENC_CODERANGE(str)) {
06172 case ENC_CODERANGE_VALID:
06173 case ENC_CODERANGE_7BIT:
06174 for (i = 0; i < len; i += n) {
06175 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
06176 rb_yield(rb_str_subseq(str, i, n));
06177 }
06178 break;
06179 default:
06180 for (i = 0; i < len; i += n) {
06181 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
06182 rb_yield(rb_str_subseq(str, i, n));
06183 }
06184 }
06185 return orig;
06186 }
06187
06188
06189
06190
06191
06192
06193
06194
06195
06196
06197
06198
06199
06200
06201
06202
06203
06204
06205
06206
06207
06208
06209 static VALUE
06210 rb_str_each_codepoint(VALUE str)
06211 {
06212 VALUE orig = str;
06213 int n;
06214 unsigned int c;
06215 const char *ptr, *end;
06216 rb_encoding *enc;
06217
06218 if (single_byte_optimizable(str)) return rb_str_each_byte(str);
06219 RETURN_ENUMERATOR(str, 0, 0);
06220 str = rb_str_new4(str);
06221 ptr = RSTRING_PTR(str);
06222 end = RSTRING_END(str);
06223 enc = STR_ENC_GET(str);
06224 while (ptr < end) {
06225 c = rb_enc_codepoint_len(ptr, end, &n, enc);
06226 rb_yield(UINT2NUM(c));
06227 ptr += n;
06228 }
06229 return orig;
06230 }
06231
06232 static long
06233 chopped_length(VALUE str)
06234 {
06235 rb_encoding *enc = STR_ENC_GET(str);
06236 const char *p, *p2, *beg, *end;
06237
06238 beg = RSTRING_PTR(str);
06239 end = beg + RSTRING_LEN(str);
06240 if (beg > end) return 0;
06241 p = rb_enc_prev_char(beg, end, end, enc);
06242 if (!p) return 0;
06243 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
06244 p2 = rb_enc_prev_char(beg, p, end, enc);
06245 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
06246 }
06247 return p - beg;
06248 }
06249
06250
06251
06252
06253
06254
06255
06256
06257
06258
06259 static VALUE
06260 rb_str_chop_bang(VALUE str)
06261 {
06262 str_modify_keep_cr(str);
06263 if (RSTRING_LEN(str) > 0) {
06264 long len;
06265 len = chopped_length(str);
06266 STR_SET_LEN(str, len);
06267 RSTRING_PTR(str)[len] = '\0';
06268 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06269 ENC_CODERANGE_CLEAR(str);
06270 }
06271 return str;
06272 }
06273 return Qnil;
06274 }
06275
06276
06277
06278
06279
06280
06281
06282
06283
06284
06285
06286
06287
06288
06289
06290
06291
06292
06293
06294 static VALUE
06295 rb_str_chop(VALUE str)
06296 {
06297 VALUE str2 = rb_str_new5(str, RSTRING_PTR(str), chopped_length(str));
06298 rb_enc_cr_str_copy_for_substr(str2, str);
06299 OBJ_INFECT(str2, str);
06300 return str2;
06301 }
06302
06303
06304
06305
06306
06307
06308
06309
06310
06311
/*
 * In-place chomp: removes a trailing record separator from +str+.
 * Returns +str+ when something was removed and nil otherwise (including
 * for an empty string or a nil separator).
 *
 * argv[0] (optional) is the separator; without arguments rb_rs is used.
 *
 *   - default separator (rb_default_rs) or the one-byte separator "\n":
 *     "smart" chomp that removes a trailing "\n", "\r\n" or "\r";
 *   - empty separator: removes every trailing "\n" / "\r\n";
 *   - other separator: removed only if the string ends with exactly those
 *     bytes and the match starts on a character boundary.
 */
06312 static VALUE
06313 rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
06314 {
06315 rb_encoding *enc;
06316 VALUE rs;
06317 int newline;
06318 char *p, *pp, *e;
06319 long len, rslen;
06320
06321 str_modify_keep_cr(str);
06322 len = RSTRING_LEN(str);
06323 if (len == 0) return Qnil;
06324 p = RSTRING_PTR(str);
06325 e = p + len;
06326 if (argc == 0) {
06327 rs = rb_rs;
06328 if (rs == rb_default_rs) {
/* Also entered by goto from below when the separator is exactly "\n". */
06329 smart_chomp:
06330 enc = rb_enc_get(str);
06331 if (rb_enc_mbminlen(enc) > 1) {
/* Encodings whose smallest character is wider than one byte: step back by
 * whole characters, dropping a trailing newline and then a trailing '\r'
 * if present. */
06332 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
06333 if (rb_enc_is_newline(pp, e, enc)) {
06334 e = pp;
06335 }
06336 pp = e - rb_enc_mbminlen(enc);
06337 if (pp >= p) {
06338 pp = rb_enc_left_char_head(p, pp, e, enc);
06339 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
06340 e = pp;
06341 }
06342 }
/* e unchanged means nothing was removed. */
06343 if (e == RSTRING_END(str)) {
06344 return Qnil;
06345 }
06346 len = e - RSTRING_PTR(str);
06347 STR_SET_LEN(str, len);
06348 }
06349 else {
/* Byte-oriented path: strip "\n" (plus a preceding "\r") or a lone "\r". */
06350 if (RSTRING_PTR(str)[len-1] == '\n') {
06351 STR_DEC_LEN(str);
06352 if (RSTRING_LEN(str) > 0 &&
06353 RSTRING_PTR(str)[RSTRING_LEN(str)-1] == '\r') {
06354 STR_DEC_LEN(str);
06355 }
06356 }
06357 else if (RSTRING_PTR(str)[len-1] == '\r') {
06358 STR_DEC_LEN(str);
06359 }
06360 else {
06361 return Qnil;
06362 }
06363 }
06364 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06365 return str;
06366 }
/* rb_rs has been changed from the default: fall through to the generic
 * handling below. */
06367 }
06368 else {
06369 rb_scan_args(argc, argv, "01", &rs);
06370 }
06371 if (NIL_P(rs)) return Qnil;
06372 StringValue(rs);
06373 rslen = RSTRING_LEN(rs);
06374 if (rslen == 0) {
/* Empty separator: strip all trailing newlines, each optionally preceded
 * by a carriage return. */
06375 while (len>0 && p[len-1] == '\n') {
06376 len--;
06377 if (len>0 && p[len-1] == '\r')
06378 len--;
06379 }
06380 if (len < RSTRING_LEN(str)) {
06381 STR_SET_LEN(str, len);
06382 RSTRING_PTR(str)[len] = '\0';
06383 return str;
06384 }
06385 return Qnil;
06386 }
06387 if (rslen > len) return Qnil;
/* newline holds the last byte of the separator, used as a cheap pre-check
 * before the full memcmp. */
06388 newline = RSTRING_PTR(rs)[rslen-1];
06389 if (rslen == 1 && newline == '\n')
06390 goto smart_chomp;
06391
06392 enc = rb_enc_check(str, rs);
06393 if (is_broken_string(rs)) {
06394 return Qnil;
06395 }
06396 pp = e - rslen;
06397 if (p[len-1] == newline &&
06398 (rslen <= 1 ||
06399 memcmp(RSTRING_PTR(rs), pp, rslen) == 0)) {
/* Reject a byte match that begins in the middle of a character. */
06400 if (rb_enc_left_char_head(p, pp, e, enc) != pp)
06401 return Qnil;
06402 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
06403 ENC_CODERANGE_CLEAR(str);
06404 }
06405 STR_SET_LEN(str, RSTRING_LEN(str) - rslen);
06406 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06407 return str;
06408 }
06409 return Qnil;
06410 }
06411
06412
06413
06414
06415
06416
06417
06418
06419
06420
06421
06422
06423
06424
06425
06426
06427
06428
06429
06430
06431
06432 static VALUE
06433 rb_str_chomp(int argc, VALUE *argv, VALUE str)
06434 {
06435 str = rb_str_dup(str);
06436 rb_str_chomp_bang(argc, argv, str);
06437 return str;
06438 }
06439
06440
06441
06442
06443
06444
06445
06446
06447
06448
06449
06450
06451
06452 static VALUE
06453 rb_str_lstrip_bang(VALUE str)
06454 {
06455 rb_encoding *enc;
06456 char *s, *t, *e;
06457
06458 str_modify_keep_cr(str);
06459 enc = STR_ENC_GET(str);
06460 s = RSTRING_PTR(str);
06461 if (!s || RSTRING_LEN(str) == 0) return Qnil;
06462 e = t = RSTRING_END(str);
06463
06464 while (s < e) {
06465 int n;
06466 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
06467
06468 if (!rb_isspace(cc)) break;
06469 s += n;
06470 }
06471
06472 if (s > RSTRING_PTR(str)) {
06473 STR_SET_LEN(str, t-s);
06474 memmove(RSTRING_PTR(str), s, RSTRING_LEN(str));
06475 RSTRING_PTR(str)[RSTRING_LEN(str)] = '\0';
06476 return str;
06477 }
06478 return Qnil;
06479 }
06480
06481
06482
06483
06484
06485
06486
06487
06488
06489
06490
06491
06492
06493 static VALUE
06494 rb_str_lstrip(VALUE str)
06495 {
06496 str = rb_str_dup(str);
06497 rb_str_lstrip_bang(str);
06498 return str;
06499 }
06500
06501
06502
06503
06504
06505
06506
06507
06508
06509
06510
06511
06512
06513
06514 static VALUE
06515 rb_str_rstrip_bang(VALUE str)
06516 {
06517 rb_encoding *enc;
06518 char *s, *t, *e;
06519
06520 str_modify_keep_cr(str);
06521 enc = STR_ENC_GET(str);
06522 rb_str_check_dummy_enc(enc);
06523 s = RSTRING_PTR(str);
06524 if (!s || RSTRING_LEN(str) == 0) return Qnil;
06525 t = e = RSTRING_END(str);
06526
06527
06528 if (single_byte_optimizable(str)) {
06529 unsigned char c;
06530 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
06531 }
06532 else {
06533 char *tp;
06534
06535 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
06536 unsigned int c = rb_enc_codepoint(tp, e, enc);
06537 if (c && !rb_isspace(c)) break;
06538 t = tp;
06539 }
06540 }
06541 if (t < e) {
06542 long len = t-RSTRING_PTR(str);
06543
06544 STR_SET_LEN(str, len);
06545 RSTRING_PTR(str)[len] = '\0';
06546 return str;
06547 }
06548 return Qnil;
06549 }
06550
06551
06552
06553
06554
06555
06556
06557
06558
06559
06560
06561
06562
06563 static VALUE
06564 rb_str_rstrip(VALUE str)
06565 {
06566 str = rb_str_dup(str);
06567 rb_str_rstrip_bang(str);
06568 return str;
06569 }
06570
06571
06572
06573
06574
06575
06576
06577
06578
06579
06580 static VALUE
06581 rb_str_strip_bang(VALUE str)
06582 {
06583 VALUE l = rb_str_lstrip_bang(str);
06584 VALUE r = rb_str_rstrip_bang(str);
06585
06586 if (NIL_P(l) && NIL_P(r)) return Qnil;
06587 return str;
06588 }
06589
06590
06591
06592
06593
06594
06595
06596
06597
06598
06599
06600
06601 static VALUE
06602 rb_str_strip(VALUE str)
06603 {
06604 str = rb_str_dup(str);
06605 rb_str_strip_bang(str);
06606 return str;
06607 }
06608
06609 static VALUE
06610 scan_once(VALUE str, VALUE pat, long *start)
06611 {
06612 VALUE result, match;
06613 struct re_registers *regs;
06614 int i;
06615
06616 if (rb_reg_search(pat, str, *start, 0) >= 0) {
06617 match = rb_backref_get();
06618 regs = RMATCH_REGS(match);
06619 if (BEG(0) == END(0)) {
06620 rb_encoding *enc = STR_ENC_GET(str);
06621
06622
06623
06624 if (RSTRING_LEN(str) > END(0))
06625 *start = END(0)+rb_enc_fast_mbclen(RSTRING_PTR(str)+END(0),
06626 RSTRING_END(str), enc);
06627 else
06628 *start = END(0)+1;
06629 }
06630 else {
06631 *start = END(0);
06632 }
06633 if (regs->num_regs == 1) {
06634 return rb_reg_nth_match(0, match);
06635 }
06636 result = rb_ary_new2(regs->num_regs);
06637 for (i=1; i < regs->num_regs; i++) {
06638 rb_ary_push(result, rb_reg_nth_match(i, match));
06639 }
06640
06641 return result;
06642 }
06643 return Qnil;
06644 }
06645
06646
06647
06648
06649
06650
06651
06652
06653
06654
06655
06656
06657
06658
06659
06660
06661
06662
06663
06664
06665
06666
06667
06668
06669
06670
06671
06672
06673
06674
06675
06676
06677
06678 static VALUE
06679 rb_str_scan(VALUE str, VALUE pat)
06680 {
06681 VALUE result;
06682 long start = 0;
06683 long last = -1, prev = 0;
06684 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
06685
06686 pat = get_pat(pat, 1);
06687 if (!rb_block_given_p()) {
06688 VALUE ary = rb_ary_new();
06689
06690 while (!NIL_P(result = scan_once(str, pat, &start))) {
06691 last = prev;
06692 prev = start;
06693 rb_ary_push(ary, result);
06694 }
06695 if (last >= 0) rb_reg_search(pat, str, last, 0);
06696 return ary;
06697 }
06698
06699 while (!NIL_P(result = scan_once(str, pat, &start))) {
06700 last = prev;
06701 prev = start;
06702 rb_yield(result);
06703 str_mod_check(str, p, len);
06704 }
06705 if (last >= 0) rb_reg_search(pat, str, last, 0);
06706 return str;
06707 }
06708
06709
06710
06711
06712
06713
06714
06715
06716
06717
06718
06719
06720
06721
06722
06723
06724 static VALUE
06725 rb_str_hex(VALUE str)
06726 {
06727 rb_encoding *enc = rb_enc_get(str);
06728
06729 if (!rb_enc_asciicompat(enc)) {
06730 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
06731 }
06732 return rb_str_to_inum(str, 16, FALSE);
06733 }
06734
06735
06736
06737
06738
06739
06740
06741
06742
06743
06744
06745
06746
06747
06748
06749
06750 static VALUE
06751 rb_str_oct(VALUE str)
06752 {
06753 rb_encoding *enc = rb_enc_get(str);
06754
06755 if (!rb_enc_asciicompat(enc)) {
06756 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
06757 }
06758 return rb_str_to_inum(str, -8, FALSE);
06759 }
06760
06761
06762
06763
06764
06765
06766
06767
06768
06769
06770
06771
06772 static VALUE
06773 rb_str_crypt(VALUE str, VALUE salt)
06774 {
06775 extern char *crypt(const char *, const char *);
06776 VALUE result;
06777 const char *s, *saltp;
06778 #ifdef BROKEN_CRYPT
06779 char salt_8bit_clean[3];
06780 #endif
06781
06782 StringValue(salt);
06783 if (RSTRING_LEN(salt) < 2)
06784 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
06785
06786 s = RSTRING_PTR(str);
06787 if (!s) s = "";
06788 saltp = RSTRING_PTR(salt);
06789 #ifdef BROKEN_CRYPT
06790 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
06791 salt_8bit_clean[0] = saltp[0] & 0x7f;
06792 salt_8bit_clean[1] = saltp[1] & 0x7f;
06793 salt_8bit_clean[2] = '\0';
06794 saltp = salt_8bit_clean;
06795 }
06796 #endif
06797 result = rb_str_new2(crypt(s, saltp));
06798 OBJ_INFECT(result, str);
06799 OBJ_INFECT(result, salt);
06800 return result;
06801 }
06802
06803
06804
06805
06806
06807
06808
06809
06810
06811
06812
06813
06814
06815
06816
06817
06818
06819
06820
06821
06822
06823
06824 VALUE
06825 rb_str_intern(VALUE s)
06826 {
06827 VALUE str = RB_GC_GUARD(s);
06828 ID id;
06829
06830 id = rb_intern_str(str);
06831 return ID2SYM(id);
06832 }
06833
06834
06835
06836
06837
06838
06839
06840
06841
06842
06843
06844 VALUE
06845 rb_str_ord(VALUE s)
06846 {
06847 unsigned int c;
06848
06849 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
06850 return UINT2NUM(c);
06851 }
06852
06853
06854
06855
06856
06857
06858
06859
06860
06861
06862
06863 static VALUE
06864 rb_str_sum(int argc, VALUE *argv, VALUE str)
06865 {
06866 VALUE vbits;
06867 int bits;
06868 char *ptr, *p, *pend;
06869 long len;
06870 VALUE sum = INT2FIX(0);
06871 unsigned long sum0 = 0;
06872
06873 if (argc == 0) {
06874 bits = 16;
06875 }
06876 else {
06877 rb_scan_args(argc, argv, "01", &vbits);
06878 bits = NUM2INT(vbits);
06879 }
06880 ptr = p = RSTRING_PTR(str);
06881 len = RSTRING_LEN(str);
06882 pend = p + len;
06883
06884 while (p < pend) {
06885 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
06886 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06887 str_mod_check(str, ptr, len);
06888 sum0 = 0;
06889 }
06890 sum0 += (unsigned char)*p;
06891 p++;
06892 }
06893
06894 if (bits == 0) {
06895 if (sum0) {
06896 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06897 }
06898 }
06899 else {
06900 if (sum == INT2FIX(0)) {
06901 if (bits < (int)sizeof(long)*CHAR_BIT) {
06902 sum0 &= (((unsigned long)1)<<bits)-1;
06903 }
06904 sum = LONG2FIX(sum0);
06905 }
06906 else {
06907 VALUE mod;
06908
06909 if (sum0) {
06910 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
06911 }
06912
06913 mod = rb_funcall(INT2FIX(1), rb_intern("<<"), 1, INT2FIX(bits));
06914 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
06915 sum = rb_funcall(sum, '&', 1, mod);
06916 }
06917 }
06918 return sum;
06919 }
06920
/*
 * Shared implementation of ljust/rjust/center: returns a new string of
 * +width+ characters containing +str+ padded with a pad string.
 *
 *   argv[0]  target width, converted with NUM2LONG;
 *   argv[1]  optional pad string (default: a single space);
 *   jflag    'l' puts all padding on the right, 'r' puts it all on the
 *            left, any other value puts n/2 characters on the left and the
 *            rest on the right.
 *
 * Returns a duplicate of +str+ when width is negative or not larger than
 * the string's character length.  Raises ArgumentError for an empty pad
 * string ("zero width padding") or when the resulting byte length would
 * overflow ("argument too big").
 */
06921 static VALUE
06922 rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
06923 {
06924 rb_encoding *enc;
06925 VALUE w;
/* flen = pad length in bytes, fclen = pad length in characters. */
06926 long width, len, flen = 1, fclen = 1;
06927 VALUE res;
06928 char *p;
06929 const char *f = " ";
/* llen/rlen = characters of padding on each side; llen2/rlen2 = byte
 * length of the trailing partial copy of the pad string on each side. */
06930 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
06931 volatile VALUE pad;
06932 int singlebyte = 1, cr;
06933
06934 rb_scan_args(argc, argv, "11", &w, &pad);
06935 enc = STR_ENC_GET(str);
06936 width = NUM2LONG(w);
06937 if (argc == 2) {
06938 StringValue(pad);
06939 enc = rb_enc_check(str, pad);
06940 f = RSTRING_PTR(pad);
06941 flen = RSTRING_LEN(pad);
06942 fclen = str_strlen(pad, enc);
06943 singlebyte = single_byte_optimizable(pad);
06944 if (flen == 0 || fclen == 0) {
06945 rb_raise(rb_eArgError, "zero width padding");
06946 }
06947 }
06948 len = str_strlen(str, enc);
06949 if (width < 0 || len >= width) return rb_str_dup(str);
06950 n = width - len;
06951 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
06952 rlen = n - llen;
06953 cr = ENC_CODERANGE(str);
06954 if (flen > 1) {
06955 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
06956 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
06957 }
06958 size = RSTRING_LEN(str);
/* Compute the result's byte length step by step, checking each step
 * against LONG_MAX before it can overflow. */
06959 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
06960 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
06961 (len += llen2 + rlen2) >= LONG_MAX - size) {
06962 rb_raise(rb_eArgError, "argument too big");
06963 }
06964 len += size;
06965 res = rb_str_new5(str, 0, len);
06966 p = RSTRING_PTR(res);
/* Left padding: memset for a single-byte pad, otherwise whole copies of
 * the pad string followed by a partial copy of llen2 bytes. */
06967 if (flen <= 1) {
06968 memset(p, *f, llen);
06969 p += llen;
06970 }
06971 else {
06972 while (llen >= fclen) {
06973 memcpy(p,f,flen);
06974 p += flen;
06975 llen -= fclen;
06976 }
06977 if (llen > 0) {
06978 memcpy(p, f, llen2);
06979 p += llen2;
06980 }
06981 }
06982 memcpy(p, RSTRING_PTR(str), size);
06983 p += size;
/* Right padding, built the same way as the left side. */
06984 if (flen <= 1) {
06985 memset(p, *f, rlen);
06986 p += rlen;
06987 }
06988 else {
06989 while (rlen >= fclen) {
06990 memcpy(p,f,flen);
06991 p += flen;
06992 rlen -= fclen;
06993 }
06994 if (rlen > 0) {
06995 memcpy(p, f, rlen2);
06996 p += rlen2;
06997 }
06998 }
06999 *p = '\0';
07000 STR_SET_LEN(res, p-RSTRING_PTR(res));
07001 OBJ_INFECT(res, str);
07002 if (!NIL_P(pad)) OBJ_INFECT(res, pad);
07003 rb_enc_associate(res, enc);
/* Combine the code ranges of str and pad; the result's code range is set
 * only when the combination is not "broken". */
07004 if (argc == 2)
07005 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
07006 if (cr != ENC_CODERANGE_BROKEN)
07007 ENC_CODERANGE_SET(res, cr);
07008 return res;
07009 }
07010
07011
07012
07013
07014
07015
07016
07017
07018
07019
07020
07021
07022
07023
07024
07025 static VALUE
07026 rb_str_ljust(int argc, VALUE *argv, VALUE str)
07027 {
07028 return rb_str_justify(argc, argv, str, 'l');
07029 }
07030
07031
07032
07033
07034
07035
07036
07037
07038
07039
07040
07041
07042
07043
07044
07045 static VALUE
07046 rb_str_rjust(int argc, VALUE *argv, VALUE str)
07047 {
07048 return rb_str_justify(argc, argv, str, 'r');
07049 }
07050
07051
07052
07053
07054
07055
07056
07057
07058
07059
07060
07061
07062
07063
07064
07065 static VALUE
07066 rb_str_center(int argc, VALUE *argv, VALUE str)
07067 {
07068 return rb_str_justify(argc, argv, str, 'c');
07069 }
07070
07071
07072
07073
07074
07075
07076
07077
07078
07079
07080
07081
07082
07083
07084
07085
07086 static VALUE
07087 rb_str_partition(VALUE str, VALUE sep)
07088 {
07089 long pos;
07090 int regex = FALSE;
07091
07092 if (TYPE(sep) == T_REGEXP) {
07093 pos = rb_reg_search(sep, str, 0, 0);
07094 regex = TRUE;
07095 }
07096 else {
07097 VALUE tmp;
07098
07099 tmp = rb_check_string_type(sep);
07100 if (NIL_P(tmp)) {
07101 rb_raise(rb_eTypeError, "type mismatch: %s given",
07102 rb_obj_classname(sep));
07103 }
07104 sep = tmp;
07105 pos = rb_str_index(str, sep, 0);
07106 }
07107 if (pos < 0) {
07108 failed:
07109 return rb_ary_new3(3, str, str_new_empty(str), str_new_empty(str));
07110 }
07111 if (regex) {
07112 sep = rb_str_subpat(str, sep, INT2FIX(0));
07113 if (pos == 0 && RSTRING_LEN(sep) == 0) goto failed;
07114 }
07115 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
07116 sep,
07117 rb_str_subseq(str, pos+RSTRING_LEN(sep),
07118 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
07119 }
07120
07121
07122
07123
07124
07125
07126
07127
07128
07129
07130
07131
07132
07133
07134
07135
07136 static VALUE
07137 rb_str_rpartition(VALUE str, VALUE sep)
07138 {
07139 long pos = RSTRING_LEN(str);
07140 int regex = FALSE;
07141
07142 if (TYPE(sep) == T_REGEXP) {
07143 pos = rb_reg_search(sep, str, pos, 1);
07144 regex = TRUE;
07145 }
07146 else {
07147 VALUE tmp;
07148
07149 tmp = rb_check_string_type(sep);
07150 if (NIL_P(tmp)) {
07151 rb_raise(rb_eTypeError, "type mismatch: %s given",
07152 rb_obj_classname(sep));
07153 }
07154 sep = tmp;
07155 pos = rb_str_sublen(str, pos);
07156 pos = rb_str_rindex(str, sep, pos);
07157 }
07158 if (pos < 0) {
07159 return rb_ary_new3(3, str_new_empty(str), str_new_empty(str), str);
07160 }
07161 if (regex) {
07162 sep = rb_reg_nth_match(0, rb_backref_get());
07163 }
07164 return rb_ary_new3(3, rb_str_substr(str, 0, pos),
07165 sep,
07166 rb_str_substr(str,pos+str_strlen(sep,STR_ENC_GET(sep)),RSTRING_LEN(str)));
07167 }
07168
07169
07170
07171
07172
07173
07174
07175
07176
07177
07178
07179
07180
07181
07182
07183
07184
07185 static VALUE
07186 rb_str_start_with(int argc, VALUE *argv, VALUE str)
07187 {
07188 int i;
07189
07190 for (i=0; i<argc; i++) {
07191 VALUE tmp = rb_check_string_type(argv[i]);
07192 if (NIL_P(tmp)) continue;
07193 rb_enc_check(str, tmp);
07194 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
07195 if (memcmp(RSTRING_PTR(str), RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
07196 return Qtrue;
07197 }
07198 return Qfalse;
07199 }
07200
07201
07202
07203
07204
07205
07206
07207
07208 static VALUE
07209 rb_str_end_with(int argc, VALUE *argv, VALUE str)
07210 {
07211 int i;
07212 char *p, *s, *e;
07213 rb_encoding *enc;
07214
07215 for (i=0; i<argc; i++) {
07216 VALUE tmp = rb_check_string_type(argv[i]);
07217 if (NIL_P(tmp)) continue;
07218 enc = rb_enc_check(str, tmp);
07219 if (RSTRING_LEN(str) < RSTRING_LEN(tmp)) continue;
07220 p = RSTRING_PTR(str);
07221 e = p + RSTRING_LEN(str);
07222 s = e - RSTRING_LEN(tmp);
07223 if (rb_enc_left_char_head(p, s, e, enc) != s)
07224 continue;
07225 if (memcmp(s, RSTRING_PTR(tmp), RSTRING_LEN(tmp)) == 0)
07226 return Qtrue;
07227 }
07228 return Qfalse;
07229 }
07230
07231 void
07232 rb_str_setter(VALUE val, ID id, VALUE *var)
07233 {
07234 if (!NIL_P(val) && TYPE(val) != T_STRING) {
07235 rb_raise(rb_eTypeError, "value of %s must be String", rb_id2name(id));
07236 }
07237 *var = val;
07238 }
07239
07240
07241
07242
07243
07244
07245
07246
07247
07248 static VALUE
07249 rb_str_force_encoding(VALUE str, VALUE enc)
07250 {
07251 str_modifiable(str);
07252 rb_enc_associate(str, rb_to_encoding(enc));
07253 ENC_CODERANGE_CLEAR(str);
07254 return str;
07255 }
07256
07257
07258
07259
07260
07261
07262
07263
07264
07265
07266
07267
07268 static VALUE
07269 rb_str_valid_encoding_p(VALUE str)
07270 {
07271 int cr = rb_enc_str_coderange(str);
07272
07273 return cr == ENC_CODERANGE_BROKEN ? Qfalse : Qtrue;
07274 }
07275
07276
07277
07278
07279
07280
07281
07282
07283
07284
07285
07286 static VALUE
07287 rb_str_is_ascii_only_p(VALUE str)
07288 {
07289 int cr = rb_enc_str_coderange(str);
07290
07291 return cr == ENC_CODERANGE_7BIT ? Qtrue : Qfalse;
07292 }
07293
/*
 * Shortens +str+ to at most +len+ characters, marking the cut with "...".
 *
 *   - Raises IndexError for a negative +len+.
 *   - Returns +str+ itself when it already fits.
 *   - When +len+ is not larger than the ellipsis, or the cut point cannot
 *     be moved back by the ellipsis length, the result is only the first
 *     +len+ bytes of "..." (with +len+ replaced by 3 in the latter case).
 *   - Otherwise the result is a leading part of +str+ followed by "...".
 *
 * For encodings that are not ASCII compatible the ellipsis is created as
 * US-ASCII and converted with rb_str_encode().
 */
07308 VALUE
07309 rb_str_ellipsize(VALUE str, long len)
07310 {
07311 static const char ellipsis[] = "...";
07312 const long ellipsislen = sizeof(ellipsis) - 1;
07313 rb_encoding *const enc = rb_enc_get(str);
07314 const long blen = RSTRING_LEN(str);
07315 const char *const p = RSTRING_PTR(str), *e = p + blen;
07316 VALUE estr, ret = 0;
07317
07318 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
/* Fits already: either len characters of minimum width cover all bytes, or
 * the pointer returned by rb_enc_nth() for index len is the end of the
 * string.  Side effect: e now holds that pointer.
 * NOTE(review): presumably rb_enc_nth returns the start of the len-th
 * character -- confirm. */
07319 if (len * rb_enc_mbminlen(enc) >= blen ||
07320 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
07321 ret = str;
07322 }
/* No room for text plus ellipsis.  The assignment len = ellipsislen is
 * only evaluated when len > ellipsislen, so a smaller len yields a
 * truncated ellipsis.
 * NOTE(review): presumably rb_enc_step_back moves e back by that many
 * characters and returns NULL if it cannot -- confirm. */
07323 else if (len <= ellipsislen ||
07324 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
07325 if (rb_enc_asciicompat(enc)) {
07326 ret = rb_str_new_with_class(str, ellipsis, len);
07327 rb_enc_associate(ret, enc);
07328 }
07329 else {
07330 estr = rb_usascii_str_new(ellipsis, len);
07331 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
07332 }
07333 }
/* Comma expression: ret is first set to the leading part of str, then the
 * encoding test selects how the ellipsis is appended. */
07334 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
07335 rb_str_cat(ret, ellipsis, ellipsislen);
07336 }
07337 else {
07338 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
07339 rb_enc_from_encoding(enc), 0, Qnil);
07340 rb_str_append(ret, estr);
07341 }
07342 return ret;
07343 }
07344
07345
07346
07347
07348
07349
07350
07351
07352
07353
07354
07355
07356
07357
07358
07359
07360
07361
07362
07363
07364
07365
07366
07367
07368
07369
07370
07371
07372
07373
07374
07375
07376
07377
07378
07379
07380
07381
07382
07383
07384
07385
07386
07387 static VALUE
07388 sym_equal(VALUE sym1, VALUE sym2)
07389 {
07390 if (sym1 == sym2) return Qtrue;
07391 return Qfalse;
07392 }
07393
07394
07395 static int
07396 sym_printable(const char *s, const char *send, rb_encoding *enc)
07397 {
07398 while (s < send) {
07399 int n;
07400 int c = rb_enc_codepoint_len(s, send, &n, enc);
07401
07402 if (!rb_enc_isprint(c, enc)) return FALSE;
07403 s += n;
07404 }
07405 return TRUE;
07406 }
07407
07408
07409
07410
07411
07412
07413
07414
07415
07416
07417 static VALUE
07418 sym_inspect(VALUE sym)
07419 {
07420 VALUE str;
07421 ID id = SYM2ID(sym);
07422 rb_encoding *enc;
07423 const char *ptr;
07424 long len;
07425 char *dest;
07426 rb_encoding *resenc = rb_default_internal_encoding();
07427
07428 if (resenc == NULL) resenc = rb_default_external_encoding();
07429 sym = rb_id2str(id);
07430 enc = STR_ENC_GET(sym);
07431 ptr = RSTRING_PTR(sym);
07432 len = RSTRING_LEN(sym);
07433 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
07434 !rb_enc_symname_p(ptr, enc) || !sym_printable(ptr, ptr + len, enc)) {
07435 str = rb_str_inspect(sym);
07436 len = RSTRING_LEN(str);
07437 rb_str_resize(str, len + 1);
07438 dest = RSTRING_PTR(str);
07439 memmove(dest + 1, dest, len);
07440 dest[0] = ':';
07441 }
07442 else {
07443 char *dest;
07444 str = rb_enc_str_new(0, len + 1, enc);
07445 dest = RSTRING_PTR(str);
07446 dest[0] = ':';
07447 memcpy(dest + 1, ptr, len);
07448 }
07449 return str;
07450 }
07451
07452
07453
07454
07455
07456
07457
07458
07459
07460
07461
07462
07463
07464 VALUE
07465 rb_sym_to_s(VALUE sym)
07466 {
07467 ID id = SYM2ID(sym);
07468
07469 return str_new3(rb_cString, rb_id2str(id));
07470 }
07471
07472
07473
07474
07475
07476
07477
07478
07479
07480
07481
07482
07483 static VALUE
07484 sym_to_sym(VALUE sym)
07485 {
07486 return sym;
07487 }
07488
07489 static VALUE
07490 sym_call(VALUE args, VALUE sym, int argc, VALUE *argv)
07491 {
07492 VALUE obj;
07493
07494 if (argc < 1) {
07495 rb_raise(rb_eArgError, "no receiver given");
07496 }
07497 obj = argv[0];
07498 return rb_funcall_passing_block(obj, (ID)sym, argc - 1, argv + 1);
07499 }
07500
07501
07502
07503
07504
07505
07506
07507
07508
07509
07510 static VALUE
07511 sym_to_proc(VALUE sym)
07512 {
07513 static VALUE sym_proc_cache = Qfalse;
07514 enum {SYM_PROC_CACHE_SIZE = 67};
07515 VALUE proc;
07516 long id, index;
07517 VALUE *aryp;
07518
07519 if (!sym_proc_cache) {
07520 sym_proc_cache = rb_ary_tmp_new(SYM_PROC_CACHE_SIZE * 2);
07521 rb_gc_register_mark_object(sym_proc_cache);
07522 rb_ary_store(sym_proc_cache, SYM_PROC_CACHE_SIZE*2 - 1, Qnil);
07523 }
07524
07525 id = SYM2ID(sym);
07526 index = (id % SYM_PROC_CACHE_SIZE) << 1;
07527
07528 aryp = RARRAY_PTR(sym_proc_cache);
07529 if (aryp[index] == sym) {
07530 return aryp[index + 1];
07531 }
07532 else {
07533 proc = rb_proc_new(sym_call, (VALUE)id);
07534 aryp[index] = sym;
07535 aryp[index + 1] = proc;
07536 return proc;
07537 }
07538 }
07539
07540
07541
07542
07543
07544
07545
07546
07547
07548 static VALUE
07549 sym_succ(VALUE sym)
07550 {
07551 return rb_str_intern(rb_str_succ(rb_sym_to_s(sym)));
07552 }
07553
07554
07555
07556
07557
07558
07559
07560
07561
07562 static VALUE
07563 sym_cmp(VALUE sym, VALUE other)
07564 {
07565 if (!SYMBOL_P(other)) {
07566 return Qnil;
07567 }
07568 return rb_str_cmp_m(rb_sym_to_s(sym), rb_sym_to_s(other));
07569 }
07570
07571
07572
07573
07574
07575
07576
07577
07578
07579 static VALUE
07580 sym_casecmp(VALUE sym, VALUE other)
07581 {
07582 if (!SYMBOL_P(other)) {
07583 return Qnil;
07584 }
07585 return rb_str_casecmp(rb_sym_to_s(sym), rb_sym_to_s(other));
07586 }
07587
07588
07589
07590
07591
07592
07593
07594
07595 static VALUE
07596 sym_match(VALUE sym, VALUE other)
07597 {
07598 return rb_str_match(rb_sym_to_s(sym), other);
07599 }
07600
07601
07602
07603
07604
07605
07606
07607
07608
07609 static VALUE
07610 sym_aref(int argc, VALUE *argv, VALUE sym)
07611 {
07612 return rb_str_aref_m(argc, argv, rb_sym_to_s(sym));
07613 }
07614
07615
07616
07617
07618
07619
07620
07621
07622 static VALUE
07623 sym_length(VALUE sym)
07624 {
07625 return rb_str_length(rb_id2str(SYM2ID(sym)));
07626 }
07627
07628
07629
07630
07631
07632
07633
07634
07635 static VALUE
07636 sym_empty(VALUE sym)
07637 {
07638 return rb_str_empty(rb_id2str(SYM2ID(sym)));
07639 }
07640
07641
07642
07643
07644
07645
07646
07647
07648 static VALUE
07649 sym_upcase(VALUE sym)
07650 {
07651 return rb_str_intern(rb_str_upcase(rb_id2str(SYM2ID(sym))));
07652 }
07653
07654
07655
07656
07657
07658
07659
07660
07661 static VALUE
07662 sym_downcase(VALUE sym)
07663 {
07664 return rb_str_intern(rb_str_downcase(rb_id2str(SYM2ID(sym))));
07665 }
07666
07667
07668
07669
07670
07671
07672
07673
07674 static VALUE
07675 sym_capitalize(VALUE sym)
07676 {
07677 return rb_str_intern(rb_str_capitalize(rb_id2str(SYM2ID(sym))));
07678 }
07679
07680
07681
07682
07683
07684
07685
07686
07687 static VALUE
07688 sym_swapcase(VALUE sym)
07689 {
07690 return rb_str_intern(rb_str_swapcase(rb_id2str(SYM2ID(sym))));
07691 }
07692
07693
07694
07695
07696
07697
07698
07699
07700 static VALUE
07701 sym_encoding(VALUE sym)
07702 {
07703 return rb_obj_encoding(rb_id2str(SYM2ID(sym)));
07704 }
07705
07706 ID
07707 rb_to_id(VALUE name)
07708 {
07709 VALUE tmp;
07710
07711 switch (TYPE(name)) {
07712 default:
07713 tmp = rb_check_string_type(name);
07714 if (NIL_P(tmp)) {
07715 tmp = rb_inspect(name);
07716 rb_raise(rb_eTypeError, "%s is not a symbol",
07717 RSTRING_PTR(tmp));
07718 }
07719 name = tmp;
07720
07721 case T_STRING:
07722 name = rb_str_intern(name);
07723
07724 case T_SYMBOL:
07725 return SYM2ID(name);
07726 }
07727 return Qnil;
07728 }
07729
07730
07731
07732
07733
07734
07735
07736
07737
07738
07739
07740
07741
07742
/*
 * Defines the String and Symbol classes and registers their methods.
 *
 * Sets the globals rb_cString and rb_cSymbol, initializes id_to_s and
 * rb_fs, and binds rb_fs to the global variables "$;" and "$-F".
 * The last argument of each rb_define_method call is the arity passed
 * to the registration API (-1 is used for the functions that take
 * argc/argv).
 */
07743 void
07744 Init_String(void)
07745 {
/* From here on, rb_intern(...) expands to rb_intern_const(...). */
07746 #undef rb_intern
07747 #define rb_intern(str) rb_intern_const(str)
07748
/* String: class definition, allocator, construction and comparison. */
07749 rb_cString = rb_define_class("String", rb_cObject);
07750 rb_include_module(rb_cString, rb_mComparable);
07751 rb_define_alloc_func(rb_cString, str_alloc);
07752 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
07753 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
07754 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
07755 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
07756 rb_define_method(rb_cString, "==", rb_str_equal, 1);
07757 rb_define_method(rb_cString, "===", rb_str_equal, 1);
07758 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
07759 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
07760 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
07761 rb_define_method(rb_cString, "+", rb_str_plus, 1);
07762 rb_define_method(rb_cString, "*", rb_str_times, 1);
07763 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
07764 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
07765 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
07766 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
07767 rb_define_method(rb_cString, "length", rb_str_length, 0);
07768 rb_define_method(rb_cString, "size", rb_str_length, 0);
07769 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
07770 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
07771 rb_define_method(rb_cString, "=~", rb_str_match, 1);
07772 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
07773 rb_define_method(rb_cString, "succ", rb_str_succ, 0);
07774 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
07775 rb_define_method(rb_cString, "next", rb_str_succ, 0);
07776 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
07777 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
07778 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
07779 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
07780 rb_define_method(rb_cString, "replace", rb_str_replace, 1);
07781 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
07782 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
07783 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
07784 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
07785 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
07786
/* String: conversions and inspection. */
07787 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
07788 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
07789 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
07790 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
07791 rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
07792 rb_define_method(rb_cString, "dump", rb_str_dump, 0);
07793
/* String: case conversion, plain and "!" variants. */
07794 rb_define_method(rb_cString, "upcase", rb_str_upcase, 0);
07795 rb_define_method(rb_cString, "downcase", rb_str_downcase, 0);
07796 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, 0);
07797 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, 0);
07798
07799 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, 0);
07800 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, 0);
07801 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, 0);
07802 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, 0);
07803
/* Note: "lines", "bytes", "chars" and "codepoints" share their
 * implementations with the each_* methods registered further below. */
07804 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
07805 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
07806 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
07807 rb_define_method(rb_cString, "lines", rb_str_each_line, -1);
07808 rb_define_method(rb_cString, "bytes", rb_str_each_byte, 0);
07809 rb_define_method(rb_cString, "chars", rb_str_each_char, 0);
07810 rb_define_method(rb_cString, "codepoints", rb_str_each_codepoint, 0);
07811 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
07812 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
07813 rb_define_method(rb_cString, "concat", rb_str_concat, 1);
07814 rb_define_method(rb_cString, "<<", rb_str_concat, 1);
07815 rb_define_method(rb_cString, "prepend", rb_str_prepend, 1);
07816 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
07817 rb_define_method(rb_cString, "intern", rb_str_intern, 0);
07818 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0);
07819 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
07820
07821 rb_define_method(rb_cString, "include?", rb_str_include, 1);
07822 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
07823 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
07824
07825 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
07826
07827 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
07828 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
07829 rb_define_method(rb_cString, "center", rb_str_center, -1);
07830
/* String: substitution and trimming, plain and "!" variants. */
07831 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
07832 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
07833 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
07834 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
07835 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
07836 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
07837 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
07838
07839 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
07840 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
07841 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
07842 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
07843 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
07844 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
07845 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
07846
/* String: character-set operations, plain and "!" variants. */
07847 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
07848 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
07849 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
07850 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
07851 rb_define_method(rb_cString, "count", rb_str_count, -1);
07852
07853 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
07854 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
07855 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
07856 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
07857
/* String: iteration. */
07858 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
07859 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
07860 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
07861 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
07862
07863 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
07864
/* "slice" shares its implementation with "[]" registered above. */
07865 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
07866 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
07867
07868 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
07869 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
07870
/* String: encoding-related methods. */
07871 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0);
07872 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
07873 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
07874 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
07875
/* Cache the ID of "to_s" in the file-level id_to_s. */
07876 id_to_s = rb_intern("to_s");
07877
/* rb_fs starts out nil and is exposed under two global variable names. */
07878 rb_fs = Qnil;
07879 rb_define_variable("$;", &rb_fs);
07880 rb_define_variable("$-F", &rb_fs);
07881
/* Symbol: class definition.  The allocator and the singleton "new" are
 * both removed, so symbols cannot be created through Symbol.new. */
07882 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
07883 rb_include_module(rb_cSymbol, rb_mComparable);
07884 rb_undef_alloc_func(rb_cSymbol);
07885 rb_undef_method(CLASS_OF(rb_cSymbol), "new");
07886 rb_define_singleton_method(rb_cSymbol, "all_symbols", rb_sym_all_symbols, 0);
07887
/* Symbol: equality, conversion and succession. */
07888 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
07889 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
07890 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
07891 rb_define_method(rb_cSymbol, "to_s", rb_sym_to_s, 0);
07892 rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
07893 rb_define_method(rb_cSymbol, "intern", sym_to_sym, 0);
07894 rb_define_method(rb_cSymbol, "to_sym", sym_to_sym, 0);
07895 rb_define_method(rb_cSymbol, "to_proc", sym_to_proc, 0);
07896 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
07897 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
07898
/* Symbol: comparison and matching. */
07899 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
07900 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
07901 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
07902
/* Symbol: string-like accessors.  Unlike String#match above (arity -1),
 * Symbol#match is bound to sym_match with arity 1, the same function
 * used for Symbol#=~. */
07903 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
07904 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
07905 rb_define_method(rb_cSymbol, "length", sym_length, 0);
07906 rb_define_method(rb_cSymbol, "size", sym_length, 0);
07907 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
07908 rb_define_method(rb_cSymbol, "match", sym_match, 1);
07909
/* Symbol: case conversion. */
07910 rb_define_method(rb_cSymbol, "upcase", sym_upcase, 0);
07911 rb_define_method(rb_cSymbol, "downcase", sym_downcase, 0);
07912 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, 0);
07913 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, 0);
07914
07915 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
07916 }
07917