00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "ruby/ruby.h"
00013 #include "ruby/encoding.h"
00014 #include "internal.h"
00015 #include "transcode_data.h"
00016 #include <ctype.h>
00017
/* Feature switch: because it is defined, the #ifdef block below declares
   the extra newline-related symbols. */
00018 #define ENABLE_ECONV_NEWLINE_OPTION 1
00019
00020
/* Globally visible VALUE handles. Judging by the names these are the
   conversion error classes; they are assigned outside this chunk
   (NOTE(review): confirm in the init code). */
00021 VALUE rb_eUndefinedConversionError;
00022 VALUE rb_eInvalidByteSequenceError;
00023 VALUE rb_eConverterNotFoundError;
00024
/* Globally visible VALUE handle; presumably the Encoding::Converter class
   (assigned outside this chunk). */
00025 VALUE rb_cEncodingConverter;
00026
/* File-local VALUE slots. The sym_ prefix suggests cached Symbols used as
   option keys/values; they are not initialized anywhere in this chunk. */
00027 static VALUE sym_invalid, sym_undef, sym_replace, sym_fallback, sym_aref;
00028 static VALUE sym_xml, sym_text, sym_attr;
00029 static VALUE sym_universal_newline;
00030 static VALUE sym_crlf_newline;
00031 static VALUE sym_cr_newline;
00032 #ifdef ENABLE_ECONV_NEWLINE_OPTION
00033 static VALUE sym_newline, sym_universal, sym_crlf, sym_cr, sym_lf;
00034 #endif
00035 static VALUE sym_partial_input;
00036
/* File-local VALUE slots whose names mirror the econv_* result codes
   returned by the conversion functions below (presumably their Symbol
   counterparts — TODO confirm). */
00037 static VALUE sym_invalid_byte_sequence;
00038 static VALUE sym_undefined_conversion;
00039 static VALUE sym_destination_buffer_full;
00040 static VALUE sym_source_buffer_empty;
00041 static VALUE sym_finished;
00042 static VALUE sym_after_output;
00043 static VALUE sym_incomplete_input;
00044
/* Forward declaration only; the definition is not part of this chunk.
   From the signature: takes source/destination encoding names, an input
   byte string with its length, an optional caller-supplied destination
   buffer with its size, and an out-parameter for the resulting length.
   NOTE(review): ownership of the returned pointer cannot be determined
   from here — check the definition. */
00045 static unsigned char *
00046 allocate_converted_string(const char *sname, const char *dname,
00047 const unsigned char *str, size_t len,
00048 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
00049 size_t *dst_len_ptr);
00050
00051
00052
/* Run-time state of one transcoder instance.  It carries everything
   transcode_restartable0() needs to suspend in the middle of a character
   and resume on the next call. */
00053 typedef struct rb_transcoding {
/* Static description (tables, callbacks, size limits) of the transcoder. */
00054 const rb_transcoder *transcoder;
00055
/* Stored from the flags argument of rb_transcoding_open_by_transcoder(). */
00056 int flags;
00057
/* Which resume label transcode_restartable0() jumps to on entry;
   0 means "start from the beginning". */
00058 int resume_position;
/* Table-walk state preserved across suspensions (accessed through the
   next_table / next_info / next_byte aliases in transcode_restartable0). */
00059 unsigned int next_table;
00060 VALUE next_info;
00061 unsigned char next_byte;
/* Progress counter while emitting a STR1 output string. */
00062 unsigned int output_index;
00063
/* recognized_len: bytes of the current character already saved in readbuf.
   readagain_len: bytes stored right after them that must be fed to the
   transcoder again on the next call (see transcode_restartable()). */
00064 ssize_t recognized_len;
00065 ssize_t readagain_len;
/* Input save area: the inline array is used when transcoder->max_input
   fits in it, otherwise ptr refers to a heap buffer (see
   TRANSCODING_READBUF and rb_transcoding_open_by_transcoder()). */
00066 union {
00067 unsigned char ary[8];
00068 unsigned char *ptr;
00069 } readbuf;
00070
/* Read offset and fill length of the pending output held in writebuf. */
00071 ssize_t writebuf_off;
00072 ssize_t writebuf_len;
/* Output staging area, inline or heap depending on transcoder->max_output
   (see TRANSCODING_WRITEBUF). */
00073 union {
00074 unsigned char ary[8];
00075 unsigned char *ptr;
00076 } writebuf;
00077
/* Transcoder-private state: embedded in ary when state_size fits,
   otherwise heap-allocated and referenced via ptr.  The double member
   only exists to force alignment. */
00078 union rb_transcoding_state_t {
00079 void *ptr;
00080 char ary[sizeof(double) > sizeof(void*) ? sizeof(double) : sizeof(void*)];
00081 double dummy_for_alignment;
00082 } state;
00083 } rb_transcoding;
/* Address of the input save area (inline array or heap pointer). */
00084 #define TRANSCODING_READBUF(tc) \
00085 ((tc)->transcoder->max_input <= (int)sizeof((tc)->readbuf.ary) ? \
00086 (tc)->readbuf.ary : \
00087 (tc)->readbuf.ptr)
/* Address of the output staging area (inline array or heap pointer). */
00088 #define TRANSCODING_WRITEBUF(tc) \
00089 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
00090 (tc)->writebuf.ary : \
00091 (tc)->writebuf.ptr)
/* Capacity of the output staging area: the inline array size, or
   max_output when the heap buffer is in use. */
00092 #define TRANSCODING_WRITEBUF_SIZE(tc) \
00093 ((tc)->transcoder->max_output <= (int)sizeof((tc)->writebuf.ary) ? \
00094 sizeof((tc)->writebuf.ary) : \
00095 (size_t)(tc)->transcoder->max_output)
/* Largest state_size that can be embedded in the state union. */
00096 #define TRANSCODING_STATE_EMBED_MAX ((int)sizeof(union rb_transcoding_state_t))
/* Address of the transcoder-private state (embedded or heap). */
00097 #define TRANSCODING_STATE(tc) \
00098 ((tc)->transcoder->state_size <= (int)sizeof((tc)->state) ? \
00099 (tc)->state.ary : \
00100 (tc)->state.ptr)
00101
/* One stage of a converter pipeline: a transcoder instance plus the buffer
   that receives its output.  rb_econv_add_transcoder_at() allocates the
   buffer and sets out_data_start == out_data_end == out_buf_start (empty);
   out_buf_end marks the end of the allocation.  The bytes between
   out_data_start and out_data_end are the pending output, which the next
   stage consumes as its input. */
00102 typedef struct {
00103 struct rb_transcoding *tc;
00104 unsigned char *out_buf_start;
00105 unsigned char *out_data_start;
00106 unsigned char *out_data_end;
00107 unsigned char *out_buf_end;
/* Result returned by the most recent conversion call for this stage. */
00108 rb_econv_result_t last_result;
00109 } rb_econv_elem_t;
00110
/* An encoding converter: an ordered array of transcoder stages (elems)
   plus bookkeeping.  rb_econv_alloc() creates it with all pointers NULL
   and all counters zero, except num_allocated. */
00111 struct rb_econv_t {
/* ECONV_* option bits; set by rb_econv_open0() / rb_econv_open(). */
00112 int flags;
/* Encoding names as passed to rb_econv_open0(); the pointers are stored,
   not copied. */
00113 const char *source_encoding_name;
00114 const char *destination_encoding_name;
00115
00116 int started;
00117
/* Replacement string settings.  They are only reset in this chunk;
   NOTE(review): replacement_allocated presumably tells whether
   replacement_str is owned by the converter — confirm where it is set. */
00118 const unsigned char *replacement_str;
00119 size_t replacement_len;
00120 const char *replacement_enc;
00121 int replacement_allocated;
00122
/* Input-side buffer pointers; all NULL after rb_econv_alloc(). */
00123 unsigned char *in_buf_start;
00124 unsigned char *in_data_start;
00125 unsigned char *in_data_end;
00126 unsigned char *in_buf_end;
/* Stage array: num_allocated slots, of which the first num_trans are in
   use.  rb_econv_add_transcoder_at() doubles the array when it is full. */
00127 rb_econv_elem_t *elems;
00128 int num_allocated;
00129 int num_trans;
00130 int num_finished;
/* Updated by rb_econv_add_transcoder_at() to the last stage that is not a
   decorator (see DECORATOR_P). */
00131 struct rb_transcoding *last_tc;
00132
00133
/* Details of the most recent conversion result/error; result starts out
   as econv_source_buffer_empty. */
00134 struct {
00135 rb_econv_result_t result;
00136 struct rb_transcoding *error_tc;
00137 const char *source_encoding;
00138 const char *destination_encoding;
00139 const unsigned char *error_bytes_start;
00140 size_t error_bytes_len;
00141 size_t readagain_len;
00142 } last_error;
00143
00144
00145
/* Encoding objects; NULL after rb_econv_alloc() and not assigned anywhere
   in this chunk. */
00146 rb_encoding *source_encoding;
00147 rb_encoding *destination_encoding;
00148 };
00149
00150
00151
00152
00153
/* True when the source encoding name is the empty string, which is how a
   "decorator" transcoder is recognized.  dname is not evaluated. */
00154 #define DECORATOR_P(sname, dname) (*(sname) == '\0')
00155
/* Registry record for one (source, destination) conversion.
   lib:        library name to load on demand, or NULL (set by
               declare_transcoder(), cleared by load_transcoder_entry()).
   transcoder: the registered transcoder, or NULL until
               rb_register_transcoder() fills it in. */
00156 typedef struct {
00157 const char *sname;
00158 const char *dname;
00159 const char *lib;
00160 const rb_transcoder *transcoder;
00161 } transcoder_entry_t;
00162
/* Two-level registry: source name -> (st_table: destination name ->
   transcoder_entry_t *).  The table itself is created outside this chunk. */
00163 static st_table *transcoder_table;
00164
00165 static transcoder_entry_t *
00166 make_transcoder_entry(const char *sname, const char *dname)
00167 {
00168 st_data_t val;
00169 st_table *table2;
00170
00171 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
00172 val = (st_data_t)st_init_strcasetable();
00173 st_add_direct(transcoder_table, (st_data_t)sname, val);
00174 }
00175 table2 = (st_table *)val;
00176 if (!st_lookup(table2, (st_data_t)dname, &val)) {
00177 transcoder_entry_t *entry = ALLOC(transcoder_entry_t);
00178 entry->sname = sname;
00179 entry->dname = dname;
00180 entry->lib = NULL;
00181 entry->transcoder = NULL;
00182 val = (st_data_t)entry;
00183 st_add_direct(table2, (st_data_t)dname, val);
00184 }
00185 return (transcoder_entry_t *)val;
00186 }
00187
00188 static transcoder_entry_t *
00189 get_transcoder_entry(const char *sname, const char *dname)
00190 {
00191 st_data_t val;
00192 st_table *table2;
00193
00194 if (!st_lookup(transcoder_table, (st_data_t)sname, &val)) {
00195 return NULL;
00196 }
00197 table2 = (st_table *)val;
00198 if (!st_lookup(table2, (st_data_t)dname, &val)) {
00199 return NULL;
00200 }
00201 return (transcoder_entry_t *)val;
00202 }
00203
00204 void
00205 rb_register_transcoder(const rb_transcoder *tr)
00206 {
00207 const char *const sname = tr->src_encoding;
00208 const char *const dname = tr->dst_encoding;
00209
00210 transcoder_entry_t *entry;
00211
00212 entry = make_transcoder_entry(sname, dname);
00213 if (entry->transcoder) {
00214 rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
00215 sname, dname);
00216 }
00217
00218 entry->transcoder = tr;
00219 }
00220
00221 static void
00222 declare_transcoder(const char *sname, const char *dname, const char *lib)
00223 {
00224 transcoder_entry_t *entry;
00225
00226 entry = make_transcoder_entry(sname, dname);
00227 entry->lib = lib;
00228 }
00229
00230 #define MAX_TRANSCODER_LIBNAME_LEN 64
00231 static const char transcoder_lib_prefix[] = "enc/trans/";
00232
00233 void
00234 rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
00235 {
00236 if (!lib || strlen(lib) > MAX_TRANSCODER_LIBNAME_LEN) {
00237 rb_raise(rb_eArgError, "invalid library name - %s",
00238 lib ? lib : "(null)");
00239 }
00240 declare_transcoder(enc1, enc2, lib);
00241 }
00242
/* Encoding names compare equal when STRCASECMP reports no difference. */
00243 #define encoding_equal(enc1, enc2) (STRCASECMP((enc1), (enc2)) == 0)
00244
/* Singly linked FIFO node holding one encoding name still to be expanded
   by transcode_search_path(). */
00245 typedef struct search_path_queue_tag {
00246 struct search_path_queue_tag *next;
00247 const char *enc;
00248 } search_path_queue_t;
00249
/* Working state of the breadth-first search in transcode_search_path().
   visited:        encoding name -> name of the encoding it was reached
                   from (NULL for the start encoding).
   queue:          head of the FIFO of encodings to expand.
   queue_last_ptr: address of the link where the next node is appended.
   base_enc:       encoding currently being expanded; recorded as the
                   predecessor by transcode_search_path_i(). */
00250 typedef struct {
00251 st_table *visited;
00252 search_path_queue_t *queue;
00253 search_path_queue_t **queue_last_ptr;
00254 const char *base_enc;
00255 } search_path_bfs_t;
00256
00257 static int
00258 transcode_search_path_i(st_data_t key, st_data_t val, st_data_t arg)
00259 {
00260 const char *dname = (const char *)key;
00261 search_path_bfs_t *bfs = (search_path_bfs_t *)arg;
00262 search_path_queue_t *q;
00263
00264 if (st_lookup(bfs->visited, (st_data_t)dname, &val)) {
00265 return ST_CONTINUE;
00266 }
00267
00268 q = ALLOC(search_path_queue_t);
00269 q->enc = dname;
00270 q->next = NULL;
00271 *bfs->queue_last_ptr = q;
00272 bfs->queue_last_ptr = &q->next;
00273
00274 st_add_direct(bfs->visited, (st_data_t)dname, (st_data_t)bfs->base_enc);
00275 return ST_CONTINUE;
00276 }
00277
00278 static int
00279 transcode_search_path(const char *sname, const char *dname,
00280 void (*callback)(const char *sname, const char *dname, int depth, void *arg),
00281 void *arg)
00282 {
00283 search_path_bfs_t bfs;
00284 search_path_queue_t *q;
00285 st_data_t val;
00286 st_table *table2;
00287 int found;
00288 int pathlen = -1;
00289
00290 if (encoding_equal(sname, dname))
00291 return -1;
00292
00293 q = ALLOC(search_path_queue_t);
00294 q->enc = sname;
00295 q->next = NULL;
00296 bfs.queue_last_ptr = &q->next;
00297 bfs.queue = q;
00298
00299 bfs.visited = st_init_strcasetable();
00300 st_add_direct(bfs.visited, (st_data_t)sname, (st_data_t)NULL);
00301
00302 while (bfs.queue) {
00303 q = bfs.queue;
00304 bfs.queue = q->next;
00305 if (!bfs.queue)
00306 bfs.queue_last_ptr = &bfs.queue;
00307
00308 if (!st_lookup(transcoder_table, (st_data_t)q->enc, &val)) {
00309 xfree(q);
00310 continue;
00311 }
00312 table2 = (st_table *)val;
00313
00314 if (st_lookup(table2, (st_data_t)dname, &val)) {
00315 st_add_direct(bfs.visited, (st_data_t)dname, (st_data_t)q->enc);
00316 xfree(q);
00317 found = 1;
00318 goto cleanup;
00319 }
00320
00321 bfs.base_enc = q->enc;
00322 st_foreach(table2, transcode_search_path_i, (st_data_t)&bfs);
00323 bfs.base_enc = NULL;
00324
00325 xfree(q);
00326 }
00327 found = 0;
00328
00329 cleanup:
00330 while (bfs.queue) {
00331 q = bfs.queue;
00332 bfs.queue = q->next;
00333 xfree(q);
00334 }
00335
00336 if (found) {
00337 const char *enc = dname;
00338 int depth;
00339 pathlen = 0;
00340 while (1) {
00341 st_lookup(bfs.visited, (st_data_t)enc, &val);
00342 if (!val)
00343 break;
00344 pathlen++;
00345 enc = (const char *)val;
00346 }
00347 depth = pathlen;
00348 enc = dname;
00349 while (1) {
00350 st_lookup(bfs.visited, (st_data_t)enc, &val);
00351 if (!val)
00352 break;
00353 callback((const char *)val, enc, --depth, arg);
00354 enc = (const char *)val;
00355 }
00356 }
00357
00358 st_free_table(bfs.visited);
00359
00360 return pathlen;
00361 }
00362
00363 static const rb_transcoder *
00364 load_transcoder_entry(transcoder_entry_t *entry)
00365 {
00366 if (entry->transcoder)
00367 return entry->transcoder;
00368
00369 if (entry->lib) {
00370 const char *lib = entry->lib;
00371 size_t len = strlen(lib);
00372 char path[sizeof(transcoder_lib_prefix) + MAX_TRANSCODER_LIBNAME_LEN];
00373 VALUE fn;
00374 const int safe = rb_safe_level();
00375
00376 entry->lib = NULL;
00377
00378 if (len > MAX_TRANSCODER_LIBNAME_LEN)
00379 return NULL;
00380 memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
00381 memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len + 1);
00382 fn = rb_str_new2(path);
00383 FL_UNSET(fn, FL_TAINT|FL_UNTRUSTED);
00384 OBJ_FREEZE(fn);
00385 if (!rb_require_safe(fn, safe > 3 ? 3 : safe))
00386 return NULL;
00387 }
00388
00389 if (entry->transcoder)
00390 return entry->transcoder;
00391
00392 return NULL;
00393 }
00394
00395 static const char*
00396 get_replacement_character(const char *encname, size_t *len_ret, const char **repl_encname_ptr)
00397 {
00398 if (encoding_equal(encname, "UTF-8")) {
00399 *len_ret = 3;
00400 *repl_encname_ptr = "UTF-8";
00401 return "\xEF\xBF\xBD";
00402 }
00403 else {
00404 *len_ret = 1;
00405 *repl_encname_ptr = "US-ASCII";
00406 return "?";
00407 }
00408 }
00409
00410
00411
00412
00413
00414 static const unsigned char *
00415 transcode_char_start(rb_transcoding *tc,
00416 const unsigned char *in_start,
00417 const unsigned char *inchar_start,
00418 const unsigned char *in_p,
00419 size_t *char_len_ptr)
00420 {
00421 const unsigned char *ptr;
00422 if (inchar_start - in_start < tc->recognized_len) {
00423 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len,
00424 inchar_start, unsigned char, in_p - inchar_start);
00425 ptr = TRANSCODING_READBUF(tc);
00426 }
00427 else {
00428 ptr = inchar_start - tc->recognized_len;
00429 }
00430 *char_len_ptr = tc->recognized_len + (in_p - inchar_start);
00431 return ptr;
00432 }
00433
/*
 * Core table-driven conversion loop, written as a resumable state machine.
 *
 * Reads bytes from *in_pos up to in_stop and writes converted bytes from
 * *out_pos up to out_stop, using the tables and callbacks of
 * tc->transcoder.  Whenever it has to stop (output full, input exhausted,
 * invalid/undefined input, ...) it leaves through SUSPEND, which stores a
 * resume position in tc and returns the econv_* result code; the next call
 * jumps straight back to the statement after that SUSPEND.  *in_pos and
 * *out_pos are updated on every return.
 *
 * opt is tested for ECONV_PARTIAL_INPUT (more input may follow, so running
 * out of input suspends instead of finishing) and ECONV_AFTER_OUTPUT
 * (suspend once some output has been produced).
 */
00434 static rb_econv_result_t
00435 transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
00436 const unsigned char *in_stop, unsigned char *out_stop,
00437 rb_transcoding *tc,
00438 const int opt)
00439 {
00440 const rb_transcoder *tr = tc->transcoder;
00441 int unitlen = tr->input_unit_length;
00442 ssize_t readagain_len = 0;
00443
00444 const unsigned char *inchar_start;
00445 const unsigned char *in_p;
00446
00447 unsigned char *out_p;
00448
00449 in_p = inchar_start = *in_pos;
00450
00451 out_p = *out_pos;
00452
/* SUSPEND(ret, num): remember resume point num, append the bytes consumed
   for the current character to the read buffer, publish the input/output
   positions, account for bytes to be re-read (readagain_len), and return
   ret.  Execution continues at resume_label<num> on the next call. */
00453 #define SUSPEND(ret, num) \
00454 do { \
00455 tc->resume_position = (num); \
00456 if (0 < in_p - inchar_start) \
00457 MEMMOVE(TRANSCODING_READBUF(tc)+tc->recognized_len, \
00458 inchar_start, unsigned char, in_p - inchar_start); \
00459 *in_pos = in_p; \
00460 *out_pos = out_p; \
00461 tc->recognized_len += in_p - inchar_start; \
00462 if (readagain_len) { \
00463 tc->recognized_len -= readagain_len; \
00464 tc->readagain_len = readagain_len; \
00465 } \
00466 return (ret); \
00467 resume_label ## num:; \
00468 } while (0)
/* SUSPEND_OBUF(num): suspend with econv_destination_buffer_full for as
   long as there is no room for at least one output byte. */
00469 #define SUSPEND_OBUF(num) \
00470 do { \
00471 while (out_stop - out_p < 1) { SUSPEND(econv_destination_buffer_full, num); } \
00472 } while (0)
00473
/* SUSPEND_AFTER_OUTPUT(num): when ECONV_AFTER_OUTPUT is requested and
   output was produced during this call, suspend with econv_after_output. */
00474 #define SUSPEND_AFTER_OUTPUT(num) \
00475 if ((opt & ECONV_AFTER_OUTPUT) && *out_pos != out_p) { \
00476 SUSPEND(econv_after_output, num); \
00477 }
00478
/* Aliases for state that lives in tc so that it survives a suspension. */
00479 #define next_table (tc->next_table)
00480 #define next_info (tc->next_info)
00481 #define next_byte (tc->next_byte)
00482 #define writebuf_len (tc->writebuf_len)
00483 #define writebuf_off (tc->writebuf_off)
00484
/* Resume dispatch: jump back to where the previous call suspended. */
00485 switch (tc->resume_position) {
00486 case 0: break;
00487 case 1: goto resume_label1;
00488 case 2: goto resume_label2;
00489 case 3: goto resume_label3;
00490 case 4: goto resume_label4;
00491 case 5: goto resume_label5;
00492 case 6: goto resume_label6;
00493 case 7: goto resume_label7;
00494 case 8: goto resume_label8;
00495 case 9: goto resume_label9;
00496 case 10: goto resume_label10;
00497 case 11: goto resume_label11;
00498 case 12: goto resume_label12;
00499 case 13: goto resume_label13;
00500 case 14: goto resume_label14;
00501 case 15: goto resume_label15;
00502 case 16: goto resume_label16;
00503 case 17: goto resume_label17;
00504 case 18: goto resume_label18;
00505 case 19: goto resume_label19;
00506 case 20: goto resume_label20;
00507 case 21: goto resume_label21;
00508 case 22: goto resume_label22;
00509 case 23: goto resume_label23;
00510 case 24: goto resume_label24;
00511 case 25: goto resume_label25;
00512 case 26: goto resume_label26;
00513 case 27: goto resume_label27;
00514 case 28: goto resume_label28;
00515 case 29: goto resume_label29;
00516 case 30: goto resume_label30;
00517 case 31: goto resume_label31;
00518 case 32: goto resume_label32;
00519 case 33: goto resume_label33;
00520 case 34: goto resume_label34;
00521 }
00522
/* Main loop: each iteration handles one input character, starting the
   table walk at the root of the conversion tree. */
00523 while (1) {
00524 inchar_start = in_p;
00525 tc->recognized_len = 0;
00526 next_table = tr->conv_tree_start;
00527
00528 SUSPEND_AFTER_OUTPUT(24);
00529
/* Out of input: leave the loop to finish, unless more input may follow. */
00530 if (in_stop <= in_p) {
00531 if (!(opt & ECONV_PARTIAL_INPUT))
00532 break;
00533 SUSPEND(econv_source_buffer_empty, 7);
00534 continue;
00535 }
00536
/* Accessors for the current lookup table: a byte range [min, max], a
   per-byte offset array, and an info array holding the action words. */
00537 #define BYTE_ADDR(index) (tr->byte_array + (index))
00538 #define WORD_ADDR(index) (tr->word_array + INFO2WORDINDEX(index))
00539 #define BL_BASE BYTE_ADDR(BYTE_LOOKUP_BASE(WORD_ADDR(next_table)))
00540 #define BL_INFO WORD_ADDR(BYTE_LOOKUP_INFO(WORD_ADDR(next_table)))
00541 #define BL_MIN_BYTE (BL_BASE[0])
00542 #define BL_MAX_BYTE (BL_BASE[1])
00543 #define BL_OFFSET(byte) (BL_BASE[2+(byte)-BL_MIN_BYTE])
00544 #define BL_ACTION(byte) (BL_INFO[BL_OFFSET((byte))])
00545
00546 next_byte = (unsigned char)*in_p++;
/* Look up the action for next_byte; bytes outside the table's range are
   invalid. */
00547 follow_byte:
00548 if (next_byte < BL_MIN_BYTE || BL_MAX_BYTE < next_byte)
00549 next_info = INVALID;
00550 else {
00551 next_info = (VALUE)BL_ACTION(next_byte);
00552 }
/* Dispatch on the action code held in the low 5 bits of next_info. */
00553 follow_info:
00554 switch (next_info & 0x1F) {
/* NOMAP: copy the input character to the output unchanged, staging it in
   the write buffer so the copy can be suspended and resumed. */
00555 case NOMAP:
00556 {
00557 const unsigned char *p = inchar_start;
00558 writebuf_off = 0;
00559 while (p < in_p) {
00560 TRANSCODING_WRITEBUF(tc)[writebuf_off++] = (unsigned char)*p++;
00561 }
00562 writebuf_len = writebuf_off;
00563 writebuf_off = 0;
00564 while (writebuf_off < writebuf_len) {
00565 SUSPEND_OBUF(3);
00566 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
00567 }
00568 }
00569 continue;
/* Low two bits clear: next_info names the next lookup table; fetch another
   input byte (suspending or reporting incomplete input if none is left)
   and continue the walk there. */
00570 case 0x00: case 0x04: case 0x08: case 0x0C:
00571 case 0x10: case 0x14: case 0x18: case 0x1C:
00572 SUSPEND_AFTER_OUTPUT(25);
00573 while (in_p >= in_stop) {
00574 if (!(opt & ECONV_PARTIAL_INPUT))
00575 goto incomplete;
00576 SUSPEND(econv_source_buffer_empty, 5);
00577 }
00578 next_byte = (unsigned char)*in_p++;
00579 next_table = (unsigned int)next_info;
00580 goto follow_byte;
/* ZERObt .. GB4bt: emit 0 to 4 output bytes packed inside next_info, one
   byte per SUSPEND_OBUF so a full output buffer can interrupt anywhere. */
00581 case ZERObt:
00582 continue;
00583 case ONEbt:
00584 SUSPEND_OBUF(9); *out_p++ = getBT1(next_info);
00585 continue;
00586 case TWObt:
00587 SUSPEND_OBUF(10); *out_p++ = getBT1(next_info);
00588 SUSPEND_OBUF(21); *out_p++ = getBT2(next_info);
00589 continue;
00590 case THREEbt:
00591 SUSPEND_OBUF(11); *out_p++ = getBT1(next_info);
00592 SUSPEND_OBUF(15); *out_p++ = getBT2(next_info);
00593 SUSPEND_OBUF(16); *out_p++ = getBT3(next_info);
00594 continue;
00595 case FOURbt:
00596 SUSPEND_OBUF(12); *out_p++ = getBT0(next_info);
00597 SUSPEND_OBUF(17); *out_p++ = getBT1(next_info);
00598 SUSPEND_OBUF(18); *out_p++ = getBT2(next_info);
00599 SUSPEND_OBUF(19); *out_p++ = getBT3(next_info);
00600 continue;
00601 case GB4bt:
00602 SUSPEND_OBUF(29); *out_p++ = getGB4bt0(next_info);
00603 SUSPEND_OBUF(30); *out_p++ = getGB4bt1(next_info);
00604 SUSPEND_OBUF(31); *out_p++ = getGB4bt2(next_info);
00605 SUSPEND_OBUF(32); *out_p++ = getGB4bt3(next_info);
00606 continue;
/* STR1: emit a length-prefixed byte string stored in the byte array;
   tc->output_index tracks progress across suspensions. */
00607 case STR1:
00608 tc->output_index = 0;
00609 while (tc->output_index < STR1_LENGTH(BYTE_ADDR(STR1_BYTEINDEX(next_info)))) {
00610 SUSPEND_OBUF(28); *out_p++ = BYTE_ADDR(STR1_BYTEINDEX(next_info))[1+tc->output_index];
00611 tc->output_index++;
00612 }
00613 continue;
/* FUNii / FUNsi: a transcoder callback computes a new action word from the
   current info (ii) or from the input character bytes (si); dispatch
   again on the result. */
00614 case FUNii:
00615 next_info = (VALUE)(*tr->func_ii)(TRANSCODING_STATE(tc), next_info);
00616 goto follow_info;
00617 case FUNsi:
00618 {
00619 const unsigned char *char_start;
00620 size_t char_len;
00621 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
00622 next_info = (VALUE)(*tr->func_si)(TRANSCODING_STATE(tc), char_start, (size_t)char_len);
00623 goto follow_info;
00624 }
/* FUNio / FUNso / FUNsio: a transcoder callback produces the output bytes.
   It writes straight into the output when at least max_output bytes are
   free; otherwise it writes into the write buffer, which is then drained
   byte by byte. */
00625 case FUNio:
00626 SUSPEND_OBUF(13);
00627 if (tr->max_output <= out_stop - out_p)
00628 out_p += tr->func_io(TRANSCODING_STATE(tc),
00629 next_info, out_p, out_stop - out_p);
00630 else {
00631 writebuf_len = tr->func_io(TRANSCODING_STATE(tc),
00632 next_info,
00633 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
00634 writebuf_off = 0;
00635 while (writebuf_off < writebuf_len) {
00636 SUSPEND_OBUF(20);
00637 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
00638 }
00639 }
00640 break;
00641 case FUNso:
00642 {
00643 const unsigned char *char_start;
00644 size_t char_len;
00645 SUSPEND_OBUF(14);
00646 if (tr->max_output <= out_stop - out_p) {
00647 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
00648 out_p += tr->func_so(TRANSCODING_STATE(tc),
00649 char_start, (size_t)char_len,
00650 out_p, out_stop - out_p);
00651 }
00652 else {
00653 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
00654 writebuf_len = tr->func_so(TRANSCODING_STATE(tc),
00655 char_start, (size_t)char_len,
00656 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
00657 writebuf_off = 0;
00658 while (writebuf_off < writebuf_len) {
00659 SUSPEND_OBUF(22);
00660 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
00661 }
00662 }
00663 break;
00664 }
00665 case FUNsio:
00666 {
00667 const unsigned char *char_start;
00668 size_t char_len;
00669 SUSPEND_OBUF(33);
00670 if (tr->max_output <= out_stop - out_p) {
00671 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
00672 out_p += tr->func_sio(TRANSCODING_STATE(tc),
00673 char_start, (size_t)char_len, next_info,
00674 out_p, out_stop - out_p);
00675 }
00676 else {
00677 char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
00678 writebuf_len = tr->func_sio(TRANSCODING_STATE(tc),
00679 char_start, (size_t)char_len, next_info,
00680 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
00681 writebuf_off = 0;
00682 while (writebuf_off < writebuf_len) {
00683 SUSPEND_OBUF(34);
00684 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
00685 }
00686 }
00687 break;
00688 }
/* INVALID: decide how many bytes belong to the invalid sequence.  If no
   more than one input unit has been seen, in_p is moved so that the report
   covers a whole unit (or all remaining input when less is available,
   waiting for more under ECONV_PARTIAL_INPUT).  Otherwise only a multiple
   of unitlen bytes is reported and the remainder (readagain_len) is
   scheduled to be read again. */
00689 case INVALID:
00690 if (tc->recognized_len + (in_p - inchar_start) <= unitlen) {
00691 if (tc->recognized_len + (in_p - inchar_start) < unitlen)
00692 SUSPEND_AFTER_OUTPUT(26);
00693 while ((opt & ECONV_PARTIAL_INPUT) && tc->recognized_len + (in_stop - inchar_start) < unitlen) {
00694 in_p = in_stop;
00695 SUSPEND(econv_source_buffer_empty, 8);
00696 }
00697 if (tc->recognized_len + (in_stop - inchar_start) <= unitlen) {
00698 in_p = in_stop;
00699 }
00700 else {
00701 in_p = inchar_start + (unitlen - tc->recognized_len);
00702 }
00703 }
00704 else {
00705 ssize_t invalid_len;
00706 ssize_t discard_len;
00707 invalid_len = tc->recognized_len + (in_p - inchar_start);
00708 discard_len = ((invalid_len - 1) / unitlen) * unitlen;
00709 readagain_len = invalid_len - discard_len;
00710 }
00711 goto invalid;
00712 case UNDEF:
00713 goto undef;
00714 default:
00715 rb_raise(rb_eRuntimeError, "unknown transcoding instruction");
00716 }
00717 continue;
00718
/* Error exits: report the condition to the caller; when called again the
   loop carries on with the next character. */
00719 invalid:
00720 SUSPEND(econv_invalid_byte_sequence, 1);
00721 continue;
00722
00723 incomplete:
00724 SUSPEND(econv_incomplete_input, 27);
00725 continue;
00726
00727 undef:
00728 SUSPEND(econv_undefined_conversion, 2);
00729 continue;
00730 }
00731
00732
/* All input consumed: let the transcoder emit any trailing bytes, directly
   when there is room for max_output bytes, else via the write buffer. */
00733 if (tr->finish_func) {
00734 SUSPEND_OBUF(4);
00735 if (tr->max_output <= out_stop - out_p) {
00736 out_p += tr->finish_func(TRANSCODING_STATE(tc),
00737 out_p, out_stop - out_p);
00738 }
00739 else {
00740 writebuf_len = tr->finish_func(TRANSCODING_STATE(tc),
00741 TRANSCODING_WRITEBUF(tc), TRANSCODING_WRITEBUF_SIZE(tc));
00742 writebuf_off = 0;
00743 while (writebuf_off < writebuf_len) {
00744 SUSPEND_OBUF(23);
00745 *out_p++ = TRANSCODING_WRITEBUF(tc)[writebuf_off++];
00746 }
00747 }
00748 }
/* Terminal state: every further call keeps returning econv_finished. */
00749 while (1)
00750 SUSPEND(econv_finished, 6);
00751 #undef SUSPEND
00752 #undef next_table
00753 #undef next_info
00754 #undef next_byte
00755 #undef writebuf_len
00756 #undef writebuf_off
00757 }
00758
00759 static rb_econv_result_t
00760 transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
00761 const unsigned char *in_stop, unsigned char *out_stop,
00762 rb_transcoding *tc,
00763 const int opt)
00764 {
00765 if (tc->readagain_len) {
00766 unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
00767 const unsigned char *readagain_pos = readagain_buf;
00768 const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
00769 rb_econv_result_t res;
00770
00771 MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
00772 unsigned char, tc->readagain_len);
00773 tc->readagain_len = 0;
00774 res = transcode_restartable0(&readagain_pos, out_pos, readagain_stop, out_stop, tc, opt|ECONV_PARTIAL_INPUT);
00775 if (res != econv_source_buffer_empty) {
00776 MEMCPY(TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len,
00777 readagain_pos, unsigned char, readagain_stop - readagain_pos);
00778 tc->readagain_len += readagain_stop - readagain_pos;
00779 return res;
00780 }
00781 }
00782 return transcode_restartable0(in_pos, out_pos, in_stop, out_stop, tc, opt);
00783 }
00784
00785 static rb_transcoding *
00786 rb_transcoding_open_by_transcoder(const rb_transcoder *tr, int flags)
00787 {
00788 rb_transcoding *tc;
00789
00790 tc = ALLOC(rb_transcoding);
00791 tc->transcoder = tr;
00792 tc->flags = flags;
00793 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
00794 tc->state.ptr = xmalloc(tr->state_size);
00795 if (tr->state_init_func) {
00796 (tr->state_init_func)(TRANSCODING_STATE(tc));
00797 }
00798 tc->resume_position = 0;
00799 tc->recognized_len = 0;
00800 tc->readagain_len = 0;
00801 tc->writebuf_len = 0;
00802 tc->writebuf_off = 0;
00803 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
00804 tc->readbuf.ptr = xmalloc(tr->max_input);
00805 }
00806 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
00807 tc->writebuf.ptr = xmalloc(tr->max_output);
00808 }
00809 return tc;
00810 }
00811
00812 static rb_econv_result_t
00813 rb_transcoding_convert(rb_transcoding *tc,
00814 const unsigned char **input_ptr, const unsigned char *input_stop,
00815 unsigned char **output_ptr, unsigned char *output_stop,
00816 int flags)
00817 {
00818 return transcode_restartable(
00819 input_ptr, output_ptr,
00820 input_stop, output_stop,
00821 tc, flags);
00822 }
00823
00824 static void
00825 rb_transcoding_close(rb_transcoding *tc)
00826 {
00827 const rb_transcoder *tr = tc->transcoder;
00828 if (tr->state_fini_func) {
00829 (tr->state_fini_func)(TRANSCODING_STATE(tc));
00830 }
00831 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size)
00832 xfree(tc->state.ptr);
00833 if ((int)sizeof(tc->readbuf.ary) < tr->max_input)
00834 xfree(tc->readbuf.ptr);
00835 if ((int)sizeof(tc->writebuf.ary) < tr->max_output)
00836 xfree(tc->writebuf.ptr);
00837 xfree(tc);
00838 }
00839
00840 static size_t
00841 rb_transcoding_memsize(rb_transcoding *tc)
00842 {
00843 size_t size = sizeof(rb_transcoding);
00844 const rb_transcoder *tr = tc->transcoder;
00845
00846 if (TRANSCODING_STATE_EMBED_MAX < tr->state_size) {
00847 size += tr->state_size;
00848 }
00849 if ((int)sizeof(tc->readbuf.ary) < tr->max_input) {
00850 size += tr->max_input;
00851 }
00852 if ((int)sizeof(tc->writebuf.ary) < tr->max_output) {
00853 size += tr->max_output;
00854 }
00855 return size;
00856 }
00857
00858 static rb_econv_t *
00859 rb_econv_alloc(int n_hint)
00860 {
00861 rb_econv_t *ec;
00862
00863 if (n_hint <= 0)
00864 n_hint = 1;
00865
00866 ec = ALLOC(rb_econv_t);
00867 ec->flags = 0;
00868 ec->source_encoding_name = NULL;
00869 ec->destination_encoding_name = NULL;
00870 ec->started = 0;
00871 ec->replacement_str = NULL;
00872 ec->replacement_len = 0;
00873 ec->replacement_enc = NULL;
00874 ec->replacement_allocated = 0;
00875 ec->in_buf_start = NULL;
00876 ec->in_data_start = NULL;
00877 ec->in_data_end = NULL;
00878 ec->in_buf_end = NULL;
00879 ec->num_allocated = n_hint;
00880 ec->num_trans = 0;
00881 ec->elems = ALLOC_N(rb_econv_elem_t, ec->num_allocated);
00882 ec->num_finished = 0;
00883 ec->last_tc = NULL;
00884 ec->last_error.result = econv_source_buffer_empty;
00885 ec->last_error.error_tc = NULL;
00886 ec->last_error.source_encoding = NULL;
00887 ec->last_error.destination_encoding = NULL;
00888 ec->last_error.error_bytes_start = NULL;
00889 ec->last_error.error_bytes_len = 0;
00890 ec->last_error.readagain_len = 0;
00891 ec->source_encoding = NULL;
00892 ec->destination_encoding = NULL;
00893 return ec;
00894 }
00895
00896 static int
00897 rb_econv_add_transcoder_at(rb_econv_t *ec, const rb_transcoder *tr, int i)
00898 {
00899 int n, j;
00900 int bufsize = 4096;
00901 unsigned char *p;
00902
00903 if (ec->num_trans == ec->num_allocated) {
00904 n = ec->num_allocated * 2;
00905 REALLOC_N(ec->elems, rb_econv_elem_t, n);
00906 ec->num_allocated = n;
00907 }
00908
00909 p = xmalloc(bufsize);
00910
00911 MEMMOVE(ec->elems+i+1, ec->elems+i, rb_econv_elem_t, ec->num_trans-i);
00912
00913 ec->elems[i].tc = rb_transcoding_open_by_transcoder(tr, 0);
00914 ec->elems[i].out_buf_start = p;
00915 ec->elems[i].out_buf_end = p + bufsize;
00916 ec->elems[i].out_data_start = p;
00917 ec->elems[i].out_data_end = p;
00918 ec->elems[i].last_result = econv_source_buffer_empty;
00919
00920 ec->num_trans++;
00921
00922 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
00923 for (j = ec->num_trans-1; i <= j; j--) {
00924 rb_transcoding *tc = ec->elems[j].tc;
00925 const rb_transcoder *tr2 = tc->transcoder;
00926 if (!DECORATOR_P(tr2->src_encoding, tr2->dst_encoding)) {
00927 ec->last_tc = tc;
00928 break;
00929 }
00930 }
00931
00932 return 0;
00933 }
00934
00935 static rb_econv_t *
00936 rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
00937 {
00938 rb_econv_t *ec;
00939 int i, ret;
00940
00941 for (i = 0; i < n; i++) {
00942 const rb_transcoder *tr;
00943 tr = load_transcoder_entry(entries[i]);
00944 if (!tr)
00945 return NULL;
00946 }
00947
00948 ec = rb_econv_alloc(n);
00949
00950 for (i = 0; i < n; i++) {
00951 const rb_transcoder *tr = load_transcoder_entry(entries[i]);
00952 ret = rb_econv_add_transcoder_at(ec, tr, ec->num_trans);
00953 if (ret == -1) {
00954 rb_econv_close(ec);
00955 return NULL;
00956 }
00957 }
00958
00959 return ec;
00960 }
00961
00962 struct trans_open_t {
00963 transcoder_entry_t **entries;
00964 int num_additional;
00965 };
00966
00967 static void
00968 trans_open_i(const char *sname, const char *dname, int depth, void *arg)
00969 {
00970 struct trans_open_t *toarg = arg;
00971
00972 if (!toarg->entries) {
00973 toarg->entries = ALLOC_N(transcoder_entry_t *, depth+1+toarg->num_additional);
00974 }
00975 toarg->entries[depth] = get_transcoder_entry(sname, dname);
00976 }
00977
00978 static rb_econv_t *
00979 rb_econv_open0(const char *sname, const char *dname, int ecflags)
00980 {
00981 transcoder_entry_t **entries = NULL;
00982 int num_trans;
00983 rb_econv_t *ec;
00984
00985 rb_encoding *senc, *denc;
00986 int sidx, didx;
00987
00988 senc = NULL;
00989 if (*sname) {
00990 sidx = rb_enc_find_index(sname);
00991 if (0 <= sidx) {
00992 senc = rb_enc_from_index(sidx);
00993 }
00994 }
00995
00996 denc = NULL;
00997 if (*dname) {
00998 didx = rb_enc_find_index(dname);
00999 if (0 <= didx) {
01000 denc = rb_enc_from_index(didx);
01001 }
01002 }
01003
01004 if (*sname == '\0' && *dname == '\0') {
01005 num_trans = 0;
01006 entries = NULL;
01007 }
01008 else {
01009 struct trans_open_t toarg;
01010 toarg.entries = NULL;
01011 toarg.num_additional = 0;
01012 num_trans = transcode_search_path(sname, dname, trans_open_i, (void *)&toarg);
01013 entries = toarg.entries;
01014 if (num_trans < 0) {
01015 xfree(entries);
01016 return NULL;
01017 }
01018 }
01019
01020 ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
01021 xfree(entries);
01022 if (!ec)
01023 return NULL;
01024
01025 ec->flags = ecflags;
01026 ec->source_encoding_name = sname;
01027 ec->destination_encoding_name = dname;
01028
01029 return ec;
01030 }
01031
01032 #define MAX_ECFLAGS_DECORATORS 32
01033
01034 static int
01035 decorator_names(int ecflags, const char **decorators_ret)
01036 {
01037 int num_decorators;
01038
01039 switch (ecflags & ECONV_NEWLINE_DECORATOR_MASK) {
01040 case ECONV_UNIVERSAL_NEWLINE_DECORATOR:
01041 case ECONV_CRLF_NEWLINE_DECORATOR:
01042 case ECONV_CR_NEWLINE_DECORATOR:
01043 case 0:
01044 break;
01045 default:
01046 return -1;
01047 }
01048
01049 if ((ecflags & ECONV_XML_TEXT_DECORATOR) &&
01050 (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR))
01051 return -1;
01052
01053 num_decorators = 0;
01054
01055 if (ecflags & ECONV_XML_TEXT_DECORATOR)
01056 decorators_ret[num_decorators++] = "xml_text_escape";
01057 if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR)
01058 decorators_ret[num_decorators++] = "xml_attr_content_escape";
01059 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR)
01060 decorators_ret[num_decorators++] = "xml_attr_quote";
01061
01062 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR)
01063 decorators_ret[num_decorators++] = "crlf_newline";
01064 if (ecflags & ECONV_CR_NEWLINE_DECORATOR)
01065 decorators_ret[num_decorators++] = "cr_newline";
01066 if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR)
01067 decorators_ret[num_decorators++] = "universal_newline";
01068
01069 return num_decorators;
01070 }
01071
01072 rb_econv_t *
01073 rb_econv_open(const char *sname, const char *dname, int ecflags)
01074 {
01075 rb_econv_t *ec;
01076 int num_decorators;
01077 const char *decorators[MAX_ECFLAGS_DECORATORS];
01078 int i;
01079
01080 num_decorators = decorator_names(ecflags, decorators);
01081 if (num_decorators == -1)
01082 return NULL;
01083
01084 ec = rb_econv_open0(sname, dname, ecflags & ECONV_ERROR_HANDLER_MASK);
01085 if (!ec)
01086 return NULL;
01087
01088 for (i = 0; i < num_decorators; i++)
01089 if (rb_econv_decorate_at_last(ec, decorators[i]) == -1) {
01090 rb_econv_close(ec);
01091 return NULL;
01092 }
01093
01094 ec->flags |= ecflags & ~ECONV_ERROR_HANDLER_MASK;
01095
01096 return ec;
01097 }
01098
01099 static int
01100 trans_sweep(rb_econv_t *ec,
01101 const unsigned char **input_ptr, const unsigned char *input_stop,
01102 unsigned char **output_ptr, unsigned char *output_stop,
01103 int flags,
01104 int start)
01105 {
01106 int try;
01107 int i, f;
01108
01109 const unsigned char **ipp, *is, *iold;
01110 unsigned char **opp, *os, *oold;
01111 rb_econv_result_t res;
01112
01113 try = 1;
01114 while (try) {
01115 try = 0;
01116 for (i = start; i < ec->num_trans; i++) {
01117 rb_econv_elem_t *te = &ec->elems[i];
01118
01119 if (i == 0) {
01120 ipp = input_ptr;
01121 is = input_stop;
01122 }
01123 else {
01124 rb_econv_elem_t *prev_te = &ec->elems[i-1];
01125 ipp = (const unsigned char **)&prev_te->out_data_start;
01126 is = prev_te->out_data_end;
01127 }
01128
01129 if (i == ec->num_trans-1) {
01130 opp = output_ptr;
01131 os = output_stop;
01132 }
01133 else {
01134 if (te->out_buf_start != te->out_data_start) {
01135 ssize_t len = te->out_data_end - te->out_data_start;
01136 ssize_t off = te->out_data_start - te->out_buf_start;
01137 MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
01138 te->out_data_start = te->out_buf_start;
01139 te->out_data_end -= off;
01140 }
01141 opp = &te->out_data_end;
01142 os = te->out_buf_end;
01143 }
01144
01145 f = flags;
01146 if (ec->num_finished != i)
01147 f |= ECONV_PARTIAL_INPUT;
01148 if (i == 0 && (flags & ECONV_AFTER_OUTPUT)) {
01149 start = 1;
01150 flags &= ~ECONV_AFTER_OUTPUT;
01151 }
01152 if (i != 0)
01153 f &= ~ECONV_AFTER_OUTPUT;
01154 iold = *ipp;
01155 oold = *opp;
01156 te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
01157 if (iold != *ipp || oold != *opp)
01158 try = 1;
01159
01160 switch (res) {
01161 case econv_invalid_byte_sequence:
01162 case econv_incomplete_input:
01163 case econv_undefined_conversion:
01164 case econv_after_output:
01165 return i;
01166
01167 case econv_destination_buffer_full:
01168 case econv_source_buffer_empty:
01169 break;
01170
01171 case econv_finished:
01172 ec->num_finished = i+1;
01173 break;
01174 }
01175 }
01176 }
01177 return -1;
01178 }
01179
01180 static rb_econv_result_t
01181 rb_trans_conv(rb_econv_t *ec,
01182 const unsigned char **input_ptr, const unsigned char *input_stop,
01183 unsigned char **output_ptr, unsigned char *output_stop,
01184 int flags,
01185 int *result_position_ptr)
01186 {
01187 int i;
01188 int needreport_index;
01189 int sweep_start;
01190
01191 unsigned char empty_buf;
01192 unsigned char *empty_ptr = &empty_buf;
01193
01194 if (!input_ptr) {
01195 input_ptr = (const unsigned char **)&empty_ptr;
01196 input_stop = empty_ptr;
01197 }
01198
01199 if (!output_ptr) {
01200 output_ptr = &empty_ptr;
01201 output_stop = empty_ptr;
01202 }
01203
01204 if (ec->elems[0].last_result == econv_after_output)
01205 ec->elems[0].last_result = econv_source_buffer_empty;
01206
01207 needreport_index = -1;
01208 for (i = ec->num_trans-1; 0 <= i; i--) {
01209 switch (ec->elems[i].last_result) {
01210 case econv_invalid_byte_sequence:
01211 case econv_incomplete_input:
01212 case econv_undefined_conversion:
01213 case econv_after_output:
01214 case econv_finished:
01215 sweep_start = i+1;
01216 needreport_index = i;
01217 goto found_needreport;
01218
01219 case econv_destination_buffer_full:
01220 case econv_source_buffer_empty:
01221 break;
01222
01223 default:
01224 rb_bug("unexpected transcode last result");
01225 }
01226 }
01227
01228
01229
01230 if (ec->elems[ec->num_trans-1].last_result == econv_destination_buffer_full &&
01231 (flags & ECONV_AFTER_OUTPUT)) {
01232 rb_econv_result_t res;
01233
01234 res = rb_trans_conv(ec, NULL, NULL, output_ptr, output_stop,
01235 (flags & ~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT,
01236 result_position_ptr);
01237
01238 if (res == econv_source_buffer_empty)
01239 return econv_after_output;
01240 return res;
01241 }
01242
01243 sweep_start = 0;
01244
01245 found_needreport:
01246
01247 do {
01248 needreport_index = trans_sweep(ec, input_ptr, input_stop, output_ptr, output_stop, flags, sweep_start);
01249 sweep_start = needreport_index + 1;
01250 } while (needreport_index != -1 && needreport_index != ec->num_trans-1);
01251
01252 for (i = ec->num_trans-1; 0 <= i; i--) {
01253 if (ec->elems[i].last_result != econv_source_buffer_empty) {
01254 rb_econv_result_t res = ec->elems[i].last_result;
01255 if (res == econv_invalid_byte_sequence ||
01256 res == econv_incomplete_input ||
01257 res == econv_undefined_conversion ||
01258 res == econv_after_output) {
01259 ec->elems[i].last_result = econv_source_buffer_empty;
01260 }
01261 if (result_position_ptr)
01262 *result_position_ptr = i;
01263 return res;
01264 }
01265 }
01266 if (result_position_ptr)
01267 *result_position_ptr = -1;
01268 return econv_source_buffer_empty;
01269 }
01270
01271 static rb_econv_result_t
01272 rb_econv_convert0(rb_econv_t *ec,
01273 const unsigned char **input_ptr, const unsigned char *input_stop,
01274 unsigned char **output_ptr, unsigned char *output_stop,
01275 int flags)
01276 {
01277 rb_econv_result_t res;
01278 int result_position;
01279 int has_output = 0;
01280
01281 memset(&ec->last_error, 0, sizeof(ec->last_error));
01282
01283 if (ec->num_trans == 0) {
01284 size_t len;
01285 if (ec->in_buf_start && ec->in_data_start != ec->in_data_end) {
01286 if (output_stop - *output_ptr < ec->in_data_end - ec->in_data_start) {
01287 len = output_stop - *output_ptr;
01288 memcpy(*output_ptr, ec->in_data_start, len);
01289 *output_ptr = output_stop;
01290 ec->in_data_start += len;
01291 res = econv_destination_buffer_full;
01292 goto gotresult;
01293 }
01294 len = ec->in_data_end - ec->in_data_start;
01295 memcpy(*output_ptr, ec->in_data_start, len);
01296 *output_ptr += len;
01297 ec->in_data_start = ec->in_data_end = ec->in_buf_start;
01298 if (flags & ECONV_AFTER_OUTPUT) {
01299 res = econv_after_output;
01300 goto gotresult;
01301 }
01302 }
01303 if (output_stop - *output_ptr < input_stop - *input_ptr) {
01304 len = output_stop - *output_ptr;
01305 }
01306 else {
01307 len = input_stop - *input_ptr;
01308 }
01309 if (0 < len && (flags & ECONV_AFTER_OUTPUT)) {
01310 *(*output_ptr)++ = *(*input_ptr)++;
01311 res = econv_after_output;
01312 goto gotresult;
01313 }
01314 memcpy(*output_ptr, *input_ptr, len);
01315 *output_ptr += len;
01316 *input_ptr += len;
01317 if (*input_ptr != input_stop)
01318 res = econv_destination_buffer_full;
01319 else if (flags & ECONV_PARTIAL_INPUT)
01320 res = econv_source_buffer_empty;
01321 else
01322 res = econv_finished;
01323 goto gotresult;
01324 }
01325
01326 if (ec->elems[ec->num_trans-1].out_data_start) {
01327 unsigned char *data_start = ec->elems[ec->num_trans-1].out_data_start;
01328 unsigned char *data_end = ec->elems[ec->num_trans-1].out_data_end;
01329 if (data_start != data_end) {
01330 size_t len;
01331 if (output_stop - *output_ptr < data_end - data_start) {
01332 len = output_stop - *output_ptr;
01333 memcpy(*output_ptr, data_start, len);
01334 *output_ptr = output_stop;
01335 ec->elems[ec->num_trans-1].out_data_start += len;
01336 res = econv_destination_buffer_full;
01337 goto gotresult;
01338 }
01339 len = data_end - data_start;
01340 memcpy(*output_ptr, data_start, len);
01341 *output_ptr += len;
01342 ec->elems[ec->num_trans-1].out_data_start =
01343 ec->elems[ec->num_trans-1].out_data_end =
01344 ec->elems[ec->num_trans-1].out_buf_start;
01345 has_output = 1;
01346 }
01347 }
01348
01349 if (ec->in_buf_start &&
01350 ec->in_data_start != ec->in_data_end) {
01351 res = rb_trans_conv(ec, (const unsigned char **)&ec->in_data_start, ec->in_data_end, output_ptr, output_stop,
01352 (flags&~ECONV_AFTER_OUTPUT)|ECONV_PARTIAL_INPUT, &result_position);
01353 if (res != econv_source_buffer_empty)
01354 goto gotresult;
01355 }
01356
01357 if (has_output &&
01358 (flags & ECONV_AFTER_OUTPUT) &&
01359 *input_ptr != input_stop) {
01360 input_stop = *input_ptr;
01361 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
01362 if (res == econv_source_buffer_empty)
01363 res = econv_after_output;
01364 }
01365 else if ((flags & ECONV_AFTER_OUTPUT) ||
01366 ec->num_trans == 1) {
01367 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
01368 }
01369 else {
01370 flags |= ECONV_AFTER_OUTPUT;
01371 do {
01372 res = rb_trans_conv(ec, input_ptr, input_stop, output_ptr, output_stop, flags, &result_position);
01373 } while (res == econv_after_output);
01374 }
01375
01376 gotresult:
01377 ec->last_error.result = res;
01378 if (res == econv_invalid_byte_sequence ||
01379 res == econv_incomplete_input ||
01380 res == econv_undefined_conversion) {
01381 rb_transcoding *error_tc = ec->elems[result_position].tc;
01382 ec->last_error.error_tc = error_tc;
01383 ec->last_error.source_encoding = error_tc->transcoder->src_encoding;
01384 ec->last_error.destination_encoding = error_tc->transcoder->dst_encoding;
01385 ec->last_error.error_bytes_start = TRANSCODING_READBUF(error_tc);
01386 ec->last_error.error_bytes_len = error_tc->recognized_len;
01387 ec->last_error.readagain_len = error_tc->readagain_len;
01388 }
01389
01390 return res;
01391 }
01392
01393 static int output_replacement_character(rb_econv_t *ec);
01394
01395 static int
01396 output_hex_charref(rb_econv_t *ec)
01397 {
01398 int ret;
01399 unsigned char utfbuf[1024];
01400 const unsigned char *utf;
01401 size_t utf_len;
01402 int utf_allocated = 0;
01403 char charef_buf[16];
01404 const unsigned char *p;
01405
01406 if (encoding_equal(ec->last_error.source_encoding, "UTF-32BE")) {
01407 utf = ec->last_error.error_bytes_start;
01408 utf_len = ec->last_error.error_bytes_len;
01409 }
01410 else {
01411 utf = allocate_converted_string(ec->last_error.source_encoding, "UTF-32BE",
01412 ec->last_error.error_bytes_start, ec->last_error.error_bytes_len,
01413 utfbuf, sizeof(utfbuf),
01414 &utf_len);
01415 if (!utf)
01416 return -1;
01417 if (utf != utfbuf && utf != ec->last_error.error_bytes_start)
01418 utf_allocated = 1;
01419 }
01420
01421 if (utf_len % 4 != 0)
01422 goto fail;
01423
01424 p = utf;
01425 while (4 <= utf_len) {
01426 unsigned int u = 0;
01427 u += p[0] << 24;
01428 u += p[1] << 16;
01429 u += p[2] << 8;
01430 u += p[3];
01431 snprintf(charef_buf, sizeof(charef_buf), "&#x%X;", u);
01432
01433 ret = rb_econv_insert_output(ec, (unsigned char *)charef_buf, strlen(charef_buf), "US-ASCII");
01434 if (ret == -1)
01435 goto fail;
01436
01437 p += 4;
01438 utf_len -= 4;
01439 }
01440
01441 if (utf_allocated)
01442 xfree((void *)utf);
01443 return 0;
01444
01445 fail:
01446 if (utf_allocated)
01447 xfree((void *)utf);
01448 return -1;
01449 }
01450
01451 rb_econv_result_t
01452 rb_econv_convert(rb_econv_t *ec,
01453 const unsigned char **input_ptr, const unsigned char *input_stop,
01454 unsigned char **output_ptr, unsigned char *output_stop,
01455 int flags)
01456 {
01457 rb_econv_result_t ret;
01458
01459 unsigned char empty_buf;
01460 unsigned char *empty_ptr = &empty_buf;
01461
01462 ec->started = 1;
01463
01464 if (!input_ptr) {
01465 input_ptr = (const unsigned char **)&empty_ptr;
01466 input_stop = empty_ptr;
01467 }
01468
01469 if (!output_ptr) {
01470 output_ptr = &empty_ptr;
01471 output_stop = empty_ptr;
01472 }
01473
01474 resume:
01475 ret = rb_econv_convert0(ec, input_ptr, input_stop, output_ptr, output_stop, flags);
01476
01477 if (ret == econv_invalid_byte_sequence ||
01478 ret == econv_incomplete_input) {
01479
01480
01481 switch (ec->flags & ECONV_INVALID_MASK) {
01482 case ECONV_INVALID_REPLACE:
01483 if (output_replacement_character(ec) == 0)
01484 goto resume;
01485 }
01486 }
01487
01488 if (ret == econv_undefined_conversion) {
01489
01490
01491
01492 switch (ec->flags & ECONV_UNDEF_MASK) {
01493 case ECONV_UNDEF_REPLACE:
01494 if (output_replacement_character(ec) == 0)
01495 goto resume;
01496 break;
01497
01498 case ECONV_UNDEF_HEX_CHARREF:
01499 if (output_hex_charref(ec) == 0)
01500 goto resume;
01501 break;
01502 }
01503 }
01504
01505 return ret;
01506 }
01507
01508 const char *
01509 rb_econv_encoding_to_insert_output(rb_econv_t *ec)
01510 {
01511 rb_transcoding *tc = ec->last_tc;
01512 const rb_transcoder *tr;
01513
01514 if (tc == NULL)
01515 return "";
01516
01517 tr = tc->transcoder;
01518
01519 if (tr->asciicompat_type == asciicompat_encoder)
01520 return tr->src_encoding;
01521 return tr->dst_encoding;
01522 }
01523
01524 static unsigned char *
01525 allocate_converted_string(const char *sname, const char *dname,
01526 const unsigned char *str, size_t len,
01527 unsigned char *caller_dst_buf, size_t caller_dst_bufsize,
01528 size_t *dst_len_ptr)
01529 {
01530 unsigned char *dst_str;
01531 size_t dst_len;
01532 size_t dst_bufsize;
01533
01534 rb_econv_t *ec;
01535 rb_econv_result_t res;
01536
01537 const unsigned char *sp;
01538 unsigned char *dp;
01539
01540 if (caller_dst_buf)
01541 dst_bufsize = caller_dst_bufsize;
01542 else if (len == 0)
01543 dst_bufsize = 1;
01544 else
01545 dst_bufsize = len;
01546
01547 ec = rb_econv_open(sname, dname, 0);
01548 if (ec == NULL)
01549 return NULL;
01550 if (caller_dst_buf)
01551 dst_str = caller_dst_buf;
01552 else
01553 dst_str = xmalloc(dst_bufsize);
01554 dst_len = 0;
01555 sp = str;
01556 dp = dst_str+dst_len;
01557 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
01558 dst_len = dp - dst_str;
01559 while (res == econv_destination_buffer_full) {
01560 if (SIZE_MAX/2 < dst_bufsize) {
01561 goto fail;
01562 }
01563 dst_bufsize *= 2;
01564 if (dst_str == caller_dst_buf) {
01565 unsigned char *tmp;
01566 tmp = xmalloc(dst_bufsize);
01567 memcpy(tmp, dst_str, dst_bufsize/2);
01568 dst_str = tmp;
01569 }
01570 else {
01571 dst_str = xrealloc(dst_str, dst_bufsize);
01572 }
01573 dp = dst_str+dst_len;
01574 res = rb_econv_convert(ec, &sp, str+len, &dp, dst_str+dst_bufsize, 0);
01575 dst_len = dp - dst_str;
01576 }
01577 if (res != econv_finished) {
01578 goto fail;
01579 }
01580 rb_econv_close(ec);
01581 *dst_len_ptr = dst_len;
01582 return dst_str;
01583
01584 fail:
01585 if (dst_str != caller_dst_buf)
01586 xfree(dst_str);
01587 rb_econv_close(ec);
01588 return NULL;
01589 }
01590
01591
01592 int
01593 rb_econv_insert_output(rb_econv_t *ec,
01594 const unsigned char *str, size_t len, const char *str_encoding)
01595 {
01596 const char *insert_encoding = rb_econv_encoding_to_insert_output(ec);
01597 unsigned char insert_buf[4096];
01598 const unsigned char *insert_str = NULL;
01599 size_t insert_len;
01600
01601 int last_trans_index;
01602 rb_transcoding *tc;
01603
01604 unsigned char **buf_start_p;
01605 unsigned char **data_start_p;
01606 unsigned char **data_end_p;
01607 unsigned char **buf_end_p;
01608
01609 size_t need;
01610
01611 ec->started = 1;
01612
01613 if (len == 0)
01614 return 0;
01615
01616 if (encoding_equal(insert_encoding, str_encoding)) {
01617 insert_str = str;
01618 insert_len = len;
01619 }
01620 else {
01621 insert_str = allocate_converted_string(str_encoding, insert_encoding,
01622 str, len, insert_buf, sizeof(insert_buf), &insert_len);
01623 if (insert_str == NULL)
01624 return -1;
01625 }
01626
01627 need = insert_len;
01628
01629 last_trans_index = ec->num_trans-1;
01630 if (ec->num_trans == 0) {
01631 tc = NULL;
01632 buf_start_p = &ec->in_buf_start;
01633 data_start_p = &ec->in_data_start;
01634 data_end_p = &ec->in_data_end;
01635 buf_end_p = &ec->in_buf_end;
01636 }
01637 else if (ec->elems[last_trans_index].tc->transcoder->asciicompat_type == asciicompat_encoder) {
01638 tc = ec->elems[last_trans_index].tc;
01639 need += tc->readagain_len;
01640 if (need < insert_len)
01641 goto fail;
01642 if (last_trans_index == 0) {
01643 buf_start_p = &ec->in_buf_start;
01644 data_start_p = &ec->in_data_start;
01645 data_end_p = &ec->in_data_end;
01646 buf_end_p = &ec->in_buf_end;
01647 }
01648 else {
01649 rb_econv_elem_t *ee = &ec->elems[last_trans_index-1];
01650 buf_start_p = &ee->out_buf_start;
01651 data_start_p = &ee->out_data_start;
01652 data_end_p = &ee->out_data_end;
01653 buf_end_p = &ee->out_buf_end;
01654 }
01655 }
01656 else {
01657 rb_econv_elem_t *ee = &ec->elems[last_trans_index];
01658 buf_start_p = &ee->out_buf_start;
01659 data_start_p = &ee->out_data_start;
01660 data_end_p = &ee->out_data_end;
01661 buf_end_p = &ee->out_buf_end;
01662 tc = ec->elems[last_trans_index].tc;
01663 }
01664
01665 if (*buf_start_p == NULL) {
01666 unsigned char *buf = xmalloc(need);
01667 *buf_start_p = buf;
01668 *data_start_p = buf;
01669 *data_end_p = buf;
01670 *buf_end_p = buf+need;
01671 }
01672 else if ((size_t)(*buf_end_p - *data_end_p) < need) {
01673 MEMMOVE(*buf_start_p, *data_start_p, unsigned char, *data_end_p - *data_start_p);
01674 *data_end_p = *buf_start_p + (*data_end_p - *data_start_p);
01675 *data_start_p = *buf_start_p;
01676 if ((size_t)(*buf_end_p - *data_end_p) < need) {
01677 unsigned char *buf;
01678 size_t s = (*data_end_p - *buf_start_p) + need;
01679 if (s < need)
01680 goto fail;
01681 buf = xrealloc(*buf_start_p, s);
01682 *data_start_p = buf;
01683 *data_end_p = buf + (*data_end_p - *buf_start_p);
01684 *buf_start_p = buf;
01685 *buf_end_p = buf + s;
01686 }
01687 }
01688
01689 memcpy(*data_end_p, insert_str, insert_len);
01690 *data_end_p += insert_len;
01691 if (tc && tc->transcoder->asciicompat_type == asciicompat_encoder) {
01692 memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len);
01693 *data_end_p += tc->readagain_len;
01694 tc->readagain_len = 0;
01695 }
01696
01697 if (insert_str != str && insert_str != insert_buf)
01698 xfree((void*)insert_str);
01699 return 0;
01700
01701 fail:
01702 if (insert_str != str && insert_str != insert_buf)
01703 xfree((void*)insert_str);
01704 return -1;
01705 }
01706
01707 void
01708 rb_econv_close(rb_econv_t *ec)
01709 {
01710 int i;
01711
01712 if (ec->replacement_allocated) {
01713 xfree((void *)ec->replacement_str);
01714 }
01715 for (i = 0; i < ec->num_trans; i++) {
01716 rb_transcoding_close(ec->elems[i].tc);
01717 if (ec->elems[i].out_buf_start)
01718 xfree(ec->elems[i].out_buf_start);
01719 }
01720 xfree(ec->in_buf_start);
01721 xfree(ec->elems);
01722 xfree(ec);
01723 }
01724
01725 size_t
01726 rb_econv_memsize(rb_econv_t *ec)
01727 {
01728 size_t size = sizeof(rb_econv_t);
01729 int i;
01730
01731 if (ec->replacement_allocated) {
01732 size += ec->replacement_len;
01733 }
01734 for (i = 0; i < ec->num_trans; i++) {
01735 size += rb_transcoding_memsize(ec->elems[i].tc);
01736
01737 if (ec->elems[i].out_buf_start) {
01738 size += ec->elems[i].out_buf_end - ec->elems[i].out_buf_start;
01739 }
01740 }
01741 size += ec->in_buf_end - ec->in_buf_start;
01742 size += sizeof(rb_econv_elem_t) * ec->num_allocated;
01743
01744 return size;
01745 }
01746
01747 int
01748 rb_econv_putbackable(rb_econv_t *ec)
01749 {
01750 if (ec->num_trans == 0)
01751 return 0;
01752 #if SIZEOF_SIZE_T > SIZEOF_INT
01753 if (ec->elems[0].tc->readagain_len > INT_MAX) return INT_MAX;
01754 #endif
01755 return (int)ec->elems[0].tc->readagain_len;
01756 }
01757
01758 void
01759 rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n)
01760 {
01761 rb_transcoding *tc;
01762 if (ec->num_trans == 0 || n == 0)
01763 return;
01764 tc = ec->elems[0].tc;
01765 memcpy(p, TRANSCODING_READBUF(tc) + tc->recognized_len + tc->readagain_len - n, n);
01766 tc->readagain_len -= n;
01767 }
01768
01769 struct asciicompat_encoding_t {
01770 const char *ascii_compat_name;
01771 const char *ascii_incompat_name;
01772 };
01773
01774 static int
01775 asciicompat_encoding_i(st_data_t key, st_data_t val, st_data_t arg)
01776 {
01777 struct asciicompat_encoding_t *data = (struct asciicompat_encoding_t *)arg;
01778 transcoder_entry_t *entry = (transcoder_entry_t *)val;
01779 const rb_transcoder *tr;
01780
01781 if (DECORATOR_P(entry->sname, entry->dname))
01782 return ST_CONTINUE;
01783 tr = load_transcoder_entry(entry);
01784 if (tr && tr->asciicompat_type == asciicompat_decoder) {
01785 data->ascii_compat_name = tr->dst_encoding;
01786 return ST_STOP;
01787 }
01788 return ST_CONTINUE;
01789 }
01790
01791 const char *
01792 rb_econv_asciicompat_encoding(const char *ascii_incompat_name)
01793 {
01794 st_data_t v;
01795 st_table *table2;
01796 struct asciicompat_encoding_t data;
01797
01798 if (!st_lookup(transcoder_table, (st_data_t)ascii_incompat_name, &v))
01799 return NULL;
01800 table2 = (st_table *)v;
01801
01802
01803
01804
01805
01806
01807
01808
01809 if (table2->num_entries != 1)
01810 return NULL;
01811
01812 data.ascii_incompat_name = ascii_incompat_name;
01813 data.ascii_compat_name = NULL;
01814 st_foreach(table2, asciicompat_encoding_i, (st_data_t)&data);
01815 return data.ascii_compat_name;
01816 }
01817
01818 VALUE
01819 rb_econv_substr_append(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags)
01820 {
01821 unsigned const char *ss, *sp, *se;
01822 unsigned char *ds, *dp, *de;
01823 rb_econv_result_t res;
01824 int max_output;
01825
01826 if (NIL_P(dst)) {
01827 dst = rb_str_buf_new(len);
01828 if (ec->destination_encoding)
01829 rb_enc_associate(dst, ec->destination_encoding);
01830 }
01831
01832 if (ec->last_tc)
01833 max_output = ec->last_tc->transcoder->max_output;
01834 else
01835 max_output = 1;
01836
01837 res = econv_destination_buffer_full;
01838 while (res == econv_destination_buffer_full) {
01839 long dlen = RSTRING_LEN(dst);
01840 if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) {
01841 unsigned long new_capa = (unsigned long)dlen + len + max_output;
01842 if (LONG_MAX < new_capa)
01843 rb_raise(rb_eArgError, "too long string");
01844 rb_str_resize(dst, new_capa);
01845 rb_str_set_len(dst, dlen);
01846 }
01847 ss = sp = (const unsigned char *)RSTRING_PTR(src) + off;
01848 se = ss + len;
01849 ds = (unsigned char *)RSTRING_PTR(dst);
01850 de = ds + rb_str_capacity(dst);
01851 dp = ds += dlen;
01852 res = rb_econv_convert(ec, &sp, se, &dp, de, flags);
01853 off += sp - ss;
01854 len -= sp - ss;
01855 rb_str_set_len(dst, dlen + (dp - ds));
01856 rb_econv_check_error(ec);
01857 }
01858
01859 return dst;
01860 }
01861
01862 VALUE
01863 rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags)
01864 {
01865 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), dst, flags);
01866 }
01867
01868 VALUE
01869 rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags)
01870 {
01871 return rb_econv_substr_append(ec, src, byteoff, bytesize, Qnil, flags);
01872 }
01873
01874 VALUE
01875 rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags)
01876 {
01877 return rb_econv_substr_append(ec, src, 0, RSTRING_LEN(src), Qnil, flags);
01878 }
01879
01880 static int
01881 rb_econv_add_converter(rb_econv_t *ec, const char *sname, const char *dname, int n)
01882 {
01883 transcoder_entry_t *entry;
01884 const rb_transcoder *tr;
01885
01886 if (ec->started != 0)
01887 return -1;
01888
01889 entry = get_transcoder_entry(sname, dname);
01890 if (!entry)
01891 return -1;
01892
01893 tr = load_transcoder_entry(entry);
01894 if (!tr) return -1;
01895
01896 return rb_econv_add_transcoder_at(ec, tr, n);
01897 }
01898
01899 static int
01900 rb_econv_decorate_at(rb_econv_t *ec, const char *decorator_name, int n)
01901 {
01902 return rb_econv_add_converter(ec, "", decorator_name, n);
01903 }
01904
01905 int
01906 rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name)
01907 {
01908 const rb_transcoder *tr;
01909
01910 if (ec->num_trans == 0)
01911 return rb_econv_decorate_at(ec, decorator_name, 0);
01912
01913 tr = ec->elems[0].tc->transcoder;
01914
01915 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
01916 tr->asciicompat_type == asciicompat_decoder)
01917 return rb_econv_decorate_at(ec, decorator_name, 1);
01918
01919 return rb_econv_decorate_at(ec, decorator_name, 0);
01920 }
01921
01922 int
01923 rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name)
01924 {
01925 const rb_transcoder *tr;
01926
01927 if (ec->num_trans == 0)
01928 return rb_econv_decorate_at(ec, decorator_name, 0);
01929
01930 tr = ec->elems[ec->num_trans-1].tc->transcoder;
01931
01932 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
01933 tr->asciicompat_type == asciicompat_encoder)
01934 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans-1);
01935
01936 return rb_econv_decorate_at(ec, decorator_name, ec->num_trans);
01937 }
01938
01939 void
01940 rb_econv_binmode(rb_econv_t *ec)
01941 {
01942 const rb_transcoder *trs[3];
01943 int n, i, j;
01944 transcoder_entry_t *entry;
01945 int num_trans;
01946
01947 n = 0;
01948 if (ec->flags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
01949 entry = get_transcoder_entry("", "universal_newline");
01950 if (entry->transcoder)
01951 trs[n++] = entry->transcoder;
01952 }
01953 if (ec->flags & ECONV_CRLF_NEWLINE_DECORATOR) {
01954 entry = get_transcoder_entry("", "crlf_newline");
01955 if (entry->transcoder)
01956 trs[n++] = entry->transcoder;
01957 }
01958 if (ec->flags & ECONV_CR_NEWLINE_DECORATOR) {
01959 entry = get_transcoder_entry("", "cr_newline");
01960 if (entry->transcoder)
01961 trs[n++] = entry->transcoder;
01962 }
01963
01964 num_trans = ec->num_trans;
01965 j = 0;
01966 for (i = 0; i < num_trans; i++) {
01967 int k;
01968 for (k = 0; k < n; k++)
01969 if (trs[k] == ec->elems[i].tc->transcoder)
01970 break;
01971 if (k == n) {
01972 ec->elems[j] = ec->elems[i];
01973 j++;
01974 }
01975 else {
01976 rb_transcoding_close(ec->elems[i].tc);
01977 xfree(ec->elems[i].out_buf_start);
01978 ec->num_trans--;
01979 }
01980 }
01981
01982 ec->flags &= ~ECONV_NEWLINE_DECORATOR_MASK;
01983
01984 }
01985
01986 static VALUE
01987 econv_description(const char *sname, const char *dname, int ecflags, VALUE mesg)
01988 {
01989 int has_description = 0;
01990
01991 if (NIL_P(mesg))
01992 mesg = rb_str_new(NULL, 0);
01993
01994 if (*sname != '\0' || *dname != '\0') {
01995 if (*sname == '\0')
01996 rb_str_cat2(mesg, dname);
01997 else if (*dname == '\0')
01998 rb_str_cat2(mesg, sname);
01999 else
02000 rb_str_catf(mesg, "%s to %s", sname, dname);
02001 has_description = 1;
02002 }
02003
02004 if (ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
02005 ECONV_XML_TEXT_DECORATOR|
02006 ECONV_XML_ATTR_CONTENT_DECORATOR|
02007 ECONV_XML_ATTR_QUOTE_DECORATOR)) {
02008 const char *pre = "";
02009 if (has_description)
02010 rb_str_cat2(mesg, " with ");
02011 if (ecflags & ECONV_UNIVERSAL_NEWLINE_DECORATOR) {
02012 rb_str_cat2(mesg, pre); pre = ",";
02013 rb_str_cat2(mesg, "universal_newline");
02014 }
02015 if (ecflags & ECONV_CRLF_NEWLINE_DECORATOR) {
02016 rb_str_cat2(mesg, pre); pre = ",";
02017 rb_str_cat2(mesg, "crlf_newline");
02018 }
02019 if (ecflags & ECONV_CR_NEWLINE_DECORATOR) {
02020 rb_str_cat2(mesg, pre); pre = ",";
02021 rb_str_cat2(mesg, "cr_newline");
02022 }
02023 if (ecflags & ECONV_XML_TEXT_DECORATOR) {
02024 rb_str_cat2(mesg, pre); pre = ",";
02025 rb_str_cat2(mesg, "xml_text");
02026 }
02027 if (ecflags & ECONV_XML_ATTR_CONTENT_DECORATOR) {
02028 rb_str_cat2(mesg, pre); pre = ",";
02029 rb_str_cat2(mesg, "xml_attr_content");
02030 }
02031 if (ecflags & ECONV_XML_ATTR_QUOTE_DECORATOR) {
02032 rb_str_cat2(mesg, pre); pre = ",";
02033 rb_str_cat2(mesg, "xml_attr_quote");
02034 }
02035 has_description = 1;
02036 }
02037 if (!has_description) {
02038 rb_str_cat2(mesg, "no-conversion");
02039 }
02040
02041 return mesg;
02042 }
02043
02044 VALUE
02045 rb_econv_open_exc(const char *sname, const char *dname, int ecflags)
02046 {
02047 VALUE mesg, exc;
02048 mesg = rb_str_new_cstr("code converter not found (");
02049 econv_description(sname, dname, ecflags, mesg);
02050 rb_str_cat2(mesg, ")");
02051 exc = rb_exc_new3(rb_eConverterNotFoundError, mesg);
02052 return exc;
02053 }
02054
02055 static VALUE
02056 make_econv_exception(rb_econv_t *ec)
02057 {
02058 VALUE mesg, exc;
02059 if (ec->last_error.result == econv_invalid_byte_sequence ||
02060 ec->last_error.result == econv_incomplete_input) {
02061 const char *err = (const char *)ec->last_error.error_bytes_start;
02062 size_t error_len = ec->last_error.error_bytes_len;
02063 VALUE bytes = rb_str_new(err, error_len);
02064 VALUE dumped = rb_str_dump(bytes);
02065 size_t readagain_len = ec->last_error.readagain_len;
02066 VALUE bytes2 = Qnil;
02067 VALUE dumped2;
02068 int idx;
02069 if (ec->last_error.result == econv_incomplete_input) {
02070 mesg = rb_sprintf("incomplete %s on %s",
02071 StringValueCStr(dumped),
02072 ec->last_error.source_encoding);
02073 }
02074 else if (readagain_len) {
02075 bytes2 = rb_str_new(err+error_len, readagain_len);
02076 dumped2 = rb_str_dump(bytes2);
02077 mesg = rb_sprintf("%s followed by %s on %s",
02078 StringValueCStr(dumped),
02079 StringValueCStr(dumped2),
02080 ec->last_error.source_encoding);
02081 }
02082 else {
02083 mesg = rb_sprintf("%s on %s",
02084 StringValueCStr(dumped),
02085 ec->last_error.source_encoding);
02086 }
02087
02088 exc = rb_exc_new3(rb_eInvalidByteSequenceError, mesg);
02089 rb_ivar_set(exc, rb_intern("error_bytes"), bytes);
02090 rb_ivar_set(exc, rb_intern("readagain_bytes"), bytes2);
02091 rb_ivar_set(exc, rb_intern("incomplete_input"), ec->last_error.result == econv_incomplete_input ? Qtrue : Qfalse);
02092
02093 set_encs:
02094 rb_ivar_set(exc, rb_intern("source_encoding_name"), rb_str_new2(ec->last_error.source_encoding));
02095 rb_ivar_set(exc, rb_intern("destination_encoding_name"), rb_str_new2(ec->last_error.destination_encoding));
02096 idx = rb_enc_find_index(ec->last_error.source_encoding);
02097 if (0 <= idx)
02098 rb_ivar_set(exc, rb_intern("source_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
02099 idx = rb_enc_find_index(ec->last_error.destination_encoding);
02100 if (0 <= idx)
02101 rb_ivar_set(exc, rb_intern("destination_encoding"), rb_enc_from_encoding(rb_enc_from_index(idx)));
02102 return exc;
02103 }
02104 if (ec->last_error.result == econv_undefined_conversion) {
02105 VALUE bytes = rb_str_new((const char *)ec->last_error.error_bytes_start,
02106 ec->last_error.error_bytes_len);
02107 VALUE dumped = Qnil;
02108 int idx;
02109 if (strcmp(ec->last_error.source_encoding, "UTF-8") == 0) {
02110 rb_encoding *utf8 = rb_utf8_encoding();
02111 const char *start, *end;
02112 int n;
02113 start = (const char *)ec->last_error.error_bytes_start;
02114 end = start + ec->last_error.error_bytes_len;
02115 n = rb_enc_precise_mbclen(start, end, utf8);
02116 if (MBCLEN_CHARFOUND_P(n) &&
02117 (size_t)MBCLEN_CHARFOUND_LEN(n) == ec->last_error.error_bytes_len) {
02118 unsigned int cc = rb_enc_mbc_to_codepoint(start, end, utf8);
02119 dumped = rb_sprintf("U+%04X", cc);
02120 }
02121 }
02122 if (dumped == Qnil)
02123 dumped = rb_str_dump(bytes);
02124 if (strcmp(ec->last_error.source_encoding,
02125 ec->source_encoding_name) == 0 &&
02126 strcmp(ec->last_error.destination_encoding,
02127 ec->destination_encoding_name) == 0) {
02128 mesg = rb_sprintf("%s from %s to %s",
02129 StringValueCStr(dumped),
02130 ec->last_error.source_encoding,
02131 ec->last_error.destination_encoding);
02132 }
02133 else {
02134 int i;
02135 mesg = rb_sprintf("%s to %s in conversion from %s",
02136 StringValueCStr(dumped),
02137 ec->last_error.destination_encoding,
02138 ec->source_encoding_name);
02139 for (i = 0; i < ec->num_trans; i++) {
02140 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
02141 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding))
02142 rb_str_catf(mesg, " to %s",
02143 ec->elems[i].tc->transcoder->dst_encoding);
02144 }
02145 }
02146 exc = rb_exc_new3(rb_eUndefinedConversionError, mesg);
02147 idx = rb_enc_find_index(ec->last_error.source_encoding);
02148 if (0 <= idx)
02149 rb_enc_associate_index(bytes, idx);
02150 rb_ivar_set(exc, rb_intern("error_char"), bytes);
02151 goto set_encs;
02152 }
02153 return Qnil;
02154 }
02155
02156 static void
02157 more_output_buffer(
02158 VALUE destination,
02159 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
02160 int max_output,
02161 unsigned char **out_start_ptr,
02162 unsigned char **out_pos,
02163 unsigned char **out_stop_ptr)
02164 {
02165 size_t len = (*out_pos - *out_start_ptr);
02166 size_t new_len = (len + max_output) * 2;
02167 *out_start_ptr = resize_destination(destination, len, new_len);
02168 *out_pos = *out_start_ptr + len;
02169 *out_stop_ptr = *out_start_ptr + new_len;
02170 }
02171
02172 static int
02173 make_replacement(rb_econv_t *ec)
02174 {
02175 rb_transcoding *tc;
02176 const rb_transcoder *tr;
02177 rb_encoding *enc;
02178 const unsigned char *replacement;
02179 const char *repl_enc;
02180 const char *ins_enc;
02181 size_t len;
02182
02183 if (ec->replacement_str)
02184 return 0;
02185
02186 ins_enc = rb_econv_encoding_to_insert_output(ec);
02187
02188 tc = ec->last_tc;
02189 if (*ins_enc) {
02190 tr = tc->transcoder;
02191 enc = rb_enc_find(tr->dst_encoding);
02192 replacement = (const unsigned char *)get_replacement_character(ins_enc, &len, &repl_enc);
02193 }
02194 else {
02195 replacement = (unsigned char *)"?";
02196 len = 1;
02197 repl_enc = "";
02198 }
02199
02200 ec->replacement_str = replacement;
02201 ec->replacement_len = len;
02202 ec->replacement_enc = repl_enc;
02203 ec->replacement_allocated = 0;
02204 return 0;
02205 }
02206
02207 int
02208 rb_econv_set_replacement(rb_econv_t *ec,
02209 const unsigned char *str, size_t len, const char *encname)
02210 {
02211 unsigned char *str2;
02212 size_t len2;
02213 const char *encname2;
02214
02215 encname2 = rb_econv_encoding_to_insert_output(ec);
02216
02217 if (encoding_equal(encname, encname2)) {
02218 str2 = xmalloc(len);
02219 MEMCPY(str2, str, unsigned char, len);
02220 len2 = len;
02221 encname2 = encname;
02222 }
02223 else {
02224 str2 = allocate_converted_string(encname, encname2, str, len, NULL, 0, &len2);
02225 if (!str2)
02226 return -1;
02227 }
02228
02229 if (ec->replacement_allocated) {
02230 xfree((void *)ec->replacement_str);
02231 }
02232 ec->replacement_allocated = 1;
02233 ec->replacement_str = str2;
02234 ec->replacement_len = len2;
02235 ec->replacement_enc = encname2;
02236 return 0;
02237 }
02238
02239 static int
02240 output_replacement_character(rb_econv_t *ec)
02241 {
02242 int ret;
02243
02244 if (make_replacement(ec) == -1)
02245 return -1;
02246
02247 ret = rb_econv_insert_output(ec, ec->replacement_str, ec->replacement_len, ec->replacement_enc);
02248 if (ret == -1)
02249 return -1;
02250
02251 return 0;
02252 }
02253
02254 #if 1
02255 #define hash_fallback rb_hash_aref
02256
02257 static VALUE
02258 proc_fallback(VALUE fallback, VALUE c)
02259 {
02260 return rb_proc_call(fallback, rb_ary_new4(1, &c));
02261 }
02262
02263 static VALUE
02264 method_fallback(VALUE fallback, VALUE c)
02265 {
02266 return rb_method_call(1, &c, fallback);
02267 }
02268
02269 static VALUE
02270 aref_fallback(VALUE fallback, VALUE c)
02271 {
02272 return rb_funcall3(fallback, sym_aref, 1, &c);
02273 }
02274
02275 static void
02276 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
02277 const unsigned char *in_stop, unsigned char *out_stop,
02278 VALUE destination,
02279 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
02280 const char *src_encoding,
02281 const char *dst_encoding,
02282 int ecflags,
02283 VALUE ecopts)
02284 {
02285 rb_econv_t *ec;
02286 rb_transcoding *last_tc;
02287 rb_econv_result_t ret;
02288 unsigned char *out_start = *out_pos;
02289 int max_output;
02290 VALUE exc;
02291 VALUE fallback = Qnil;
02292 VALUE (*fallback_func)(VALUE, VALUE) = 0;
02293
02294 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
02295 if (!ec)
02296 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
02297
02298 if (!NIL_P(ecopts) && TYPE(ecopts) == T_HASH) {
02299 fallback = rb_hash_aref(ecopts, sym_fallback);
02300 if (RB_TYPE_P(fallback, T_HASH)) {
02301 fallback_func = hash_fallback;
02302 }
02303 else if (rb_obj_is_proc(fallback)) {
02304 fallback_func = proc_fallback;
02305 }
02306 else if (rb_obj_is_method(fallback)) {
02307 fallback_func = method_fallback;
02308 }
02309 else {
02310 fallback_func = aref_fallback;
02311 }
02312 }
02313 last_tc = ec->last_tc;
02314 max_output = last_tc ? last_tc->transcoder->max_output : 1;
02315
02316 resume:
02317 ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, 0);
02318
02319 if (!NIL_P(fallback) && ret == econv_undefined_conversion) {
02320 VALUE rep = rb_enc_str_new(
02321 (const char *)ec->last_error.error_bytes_start,
02322 ec->last_error.error_bytes_len,
02323 rb_enc_find(ec->last_error.source_encoding));
02324 rep = (*fallback_func)(fallback, rep);
02325 if (rep != Qundef && !NIL_P(rep)) {
02326 StringValue(rep);
02327 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(rep),
02328 RSTRING_LEN(rep), rb_enc_name(rb_enc_get(rep)));
02329 if ((int)ret == -1) {
02330 rb_raise(rb_eArgError, "too big fallback string");
02331 }
02332 goto resume;
02333 }
02334 }
02335
02336 if (ret == econv_invalid_byte_sequence ||
02337 ret == econv_incomplete_input ||
02338 ret == econv_undefined_conversion) {
02339 exc = make_econv_exception(ec);
02340 rb_econv_close(ec);
02341 rb_exc_raise(exc);
02342 }
02343
02344 if (ret == econv_destination_buffer_full) {
02345 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
02346 goto resume;
02347 }
02348
02349 rb_econv_close(ec);
02350 return;
02351 }
02352 #else
02353
02354 static void
02355 transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
02356 const unsigned char *in_stop, unsigned char *out_stop,
02357 VALUE destination,
02358 unsigned char *(*resize_destination)(VALUE, size_t, size_t),
02359 const char *src_encoding,
02360 const char *dst_encoding,
02361 int ecflags,
02362 VALUE ecopts)
02363 {
02364 rb_econv_t *ec;
02365 rb_transcoding *last_tc;
02366 rb_econv_result_t ret;
02367 unsigned char *out_start = *out_pos;
02368 const unsigned char *ptr;
02369 int max_output;
02370 VALUE exc;
02371
02372 ec = rb_econv_open_opts(src_encoding, dst_encoding, ecflags, ecopts);
02373 if (!ec)
02374 rb_exc_raise(rb_econv_open_exc(src_encoding, dst_encoding, ecflags));
02375
02376 last_tc = ec->last_tc;
02377 max_output = last_tc ? last_tc->transcoder->max_output : 1;
02378
02379 ret = econv_source_buffer_empty;
02380 ptr = *in_pos;
02381 while (ret != econv_finished) {
02382 unsigned char input_byte;
02383 const unsigned char *p = &input_byte;
02384
02385 if (ret == econv_source_buffer_empty) {
02386 if (ptr < in_stop) {
02387 input_byte = *ptr;
02388 ret = rb_econv_convert(ec, &p, p+1, out_pos, out_stop, ECONV_PARTIAL_INPUT);
02389 }
02390 else {
02391 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, 0);
02392 }
02393 }
02394 else {
02395 ret = rb_econv_convert(ec, NULL, NULL, out_pos, out_stop, ECONV_PARTIAL_INPUT);
02396 }
02397 if (&input_byte != p)
02398 ptr += p - &input_byte;
02399 switch (ret) {
02400 case econv_invalid_byte_sequence:
02401 case econv_incomplete_input:
02402 case econv_undefined_conversion:
02403 exc = make_econv_exception(ec);
02404 rb_econv_close(ec);
02405 rb_exc_raise(exc);
02406 break;
02407
02408 case econv_destination_buffer_full:
02409 more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
02410 break;
02411
02412 case econv_source_buffer_empty:
02413 break;
02414
02415 case econv_finished:
02416 break;
02417 }
02418 }
02419 rb_econv_close(ec);
02420 *in_pos = in_stop;
02421 return;
02422 }
02423 #endif
02424
02425
02426
02427
02428
02429
02430 static unsigned char *
02431 str_transcoding_resize(VALUE destination, size_t len, size_t new_len)
02432 {
02433 rb_str_resize(destination, new_len);
02434 return (unsigned char *)RSTRING_PTR(destination);
02435 }
02436
02437 static int
02438 econv_opts(VALUE opt, int ecflags)
02439 {
02440 VALUE v;
02441
02442 v = rb_hash_aref(opt, sym_invalid);
02443 if (NIL_P(v)) {
02444 }
02445 else if (v==sym_replace) {
02446 ecflags |= ECONV_INVALID_REPLACE;
02447 }
02448 else {
02449 rb_raise(rb_eArgError, "unknown value for invalid character option");
02450 }
02451
02452 v = rb_hash_aref(opt, sym_undef);
02453 if (NIL_P(v)) {
02454 }
02455 else if (v==sym_replace) {
02456 ecflags |= ECONV_UNDEF_REPLACE;
02457 }
02458 else {
02459 rb_raise(rb_eArgError, "unknown value for undefined character option");
02460 }
02461
02462 v = rb_hash_aref(opt, sym_replace);
02463 if (!NIL_P(v) && !(ecflags & ECONV_INVALID_REPLACE)) {
02464 ecflags |= ECONV_UNDEF_REPLACE;
02465 }
02466
02467 v = rb_hash_aref(opt, sym_xml);
02468 if (!NIL_P(v)) {
02469 if (v==sym_text) {
02470 ecflags |= ECONV_XML_TEXT_DECORATOR|ECONV_UNDEF_HEX_CHARREF;
02471 }
02472 else if (v==sym_attr) {
02473 ecflags |= ECONV_XML_ATTR_CONTENT_DECORATOR|ECONV_XML_ATTR_QUOTE_DECORATOR|ECONV_UNDEF_HEX_CHARREF;
02474 }
02475 else if (TYPE(v) == T_SYMBOL) {
02476 rb_raise(rb_eArgError, "unexpected value for xml option: %s", rb_id2name(SYM2ID(v)));
02477 }
02478 else {
02479 rb_raise(rb_eArgError, "unexpected value for xml option");
02480 }
02481 }
02482
02483 #ifdef ENABLE_ECONV_NEWLINE_OPTION
02484 v = rb_hash_aref(opt, sym_newline);
02485 if (!NIL_P(v)) {
02486 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
02487 if (v == sym_universal) {
02488 ecflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR;
02489 }
02490 else if (v == sym_crlf) {
02491 ecflags |= ECONV_CRLF_NEWLINE_DECORATOR;
02492 }
02493 else if (v == sym_cr) {
02494 ecflags |= ECONV_CR_NEWLINE_DECORATOR;
02495 }
02496 else if (v == sym_lf) {
02497
02498 }
02499 else if (SYMBOL_P(v)) {
02500 rb_raise(rb_eArgError, "unexpected value for newline option: %s",
02501 rb_id2name(SYM2ID(v)));
02502 }
02503 else {
02504 rb_raise(rb_eArgError, "unexpected value for newline option");
02505 }
02506 }
02507 else
02508 #endif
02509 {
02510 int setflags = 0, newlineflag = 0;
02511
02512 v = rb_hash_aref(opt, sym_universal_newline);
02513 if (RTEST(v))
02514 setflags |= ECONV_UNIVERSAL_NEWLINE_DECORATOR;
02515 newlineflag |= !NIL_P(v);
02516
02517 v = rb_hash_aref(opt, sym_crlf_newline);
02518 if (RTEST(v))
02519 setflags |= ECONV_CRLF_NEWLINE_DECORATOR;
02520 newlineflag |= !NIL_P(v);
02521
02522 v = rb_hash_aref(opt, sym_cr_newline);
02523 if (RTEST(v))
02524 setflags |= ECONV_CR_NEWLINE_DECORATOR;
02525 newlineflag |= !NIL_P(v);
02526
02527 if (newlineflag) {
02528 ecflags &= ~ECONV_NEWLINE_DECORATOR_MASK;
02529 ecflags |= setflags;
02530 }
02531 }
02532
02533 return ecflags;
02534 }
02535
02536 int
02537 rb_econv_prepare_options(VALUE opthash, VALUE *opts, int ecflags)
02538 {
02539 VALUE newhash = Qnil;
02540 VALUE v;
02541
02542 if (NIL_P(opthash)) {
02543 *opts = Qnil;
02544 return ecflags;
02545 }
02546 ecflags = econv_opts(opthash, ecflags);
02547
02548 v = rb_hash_aref(opthash, sym_replace);
02549 if (!NIL_P(v)) {
02550 StringValue(v);
02551 if (rb_enc_str_coderange(v) == ENC_CODERANGE_BROKEN) {
02552 VALUE dumped = rb_str_dump(v);
02553 rb_raise(rb_eArgError, "replacement string is broken: %s as %s",
02554 StringValueCStr(dumped),
02555 rb_enc_name(rb_enc_get(v)));
02556 }
02557 v = rb_str_new_frozen(v);
02558 newhash = rb_hash_new();
02559 rb_hash_aset(newhash, sym_replace, v);
02560 }
02561
02562 v = rb_hash_aref(opthash, sym_fallback);
02563 if (!NIL_P(v)) {
02564 VALUE h = rb_check_hash_type(v);
02565 if (NIL_P(h)
02566 ? (rb_obj_is_proc(v) || rb_obj_is_method(v) || rb_respond_to(v, sym_aref))
02567 : (v = h, 1)) {
02568 if (NIL_P(newhash))
02569 newhash = rb_hash_new();
02570 rb_hash_aset(newhash, sym_fallback, v);
02571 }
02572 }
02573
02574 if (!NIL_P(newhash))
02575 rb_hash_freeze(newhash);
02576 *opts = newhash;
02577
02578 return ecflags;
02579 }
02580
02581 int
02582 rb_econv_prepare_opts(VALUE opthash, VALUE *opts)
02583 {
02584 return rb_econv_prepare_options(opthash, opts, 0);
02585 }
02586
02587 rb_econv_t *
02588 rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE opthash)
02589 {
02590 rb_econv_t *ec;
02591 VALUE replacement;
02592
02593 if (NIL_P(opthash)) {
02594 replacement = Qnil;
02595 }
02596 else {
02597 if (TYPE(opthash) != T_HASH || !OBJ_FROZEN(opthash))
02598 rb_bug("rb_econv_open_opts called with invalid opthash");
02599 replacement = rb_hash_aref(opthash, sym_replace);
02600 }
02601
02602 ec = rb_econv_open(source_encoding, destination_encoding, ecflags);
02603 if (!ec)
02604 return ec;
02605
02606 if (!NIL_P(replacement)) {
02607 int ret;
02608 rb_encoding *enc = rb_enc_get(replacement);
02609
02610 ret = rb_econv_set_replacement(ec,
02611 (const unsigned char *)RSTRING_PTR(replacement),
02612 RSTRING_LEN(replacement),
02613 rb_enc_name(enc));
02614 if (ret == -1) {
02615 rb_econv_close(ec);
02616 return NULL;
02617 }
02618 }
02619 return ec;
02620 }
02621
02622 static int
02623 enc_arg(volatile VALUE *arg, const char **name_p, rb_encoding **enc_p)
02624 {
02625 rb_encoding *enc;
02626 const char *n;
02627 int encidx;
02628 VALUE encval;
02629
02630 if (((encidx = rb_to_encoding_index(encval = *arg)) < 0) ||
02631 !(enc = rb_enc_from_index(encidx))) {
02632 enc = NULL;
02633 encidx = 0;
02634 n = StringValueCStr(*arg);
02635 }
02636 else {
02637 n = rb_enc_name(enc);
02638 }
02639
02640 *name_p = n;
02641 *enc_p = enc;
02642
02643 return encidx;
02644 }
02645
02646 static int
02647 str_transcode_enc_args(VALUE str, volatile VALUE *arg1, volatile VALUE *arg2,
02648 const char **sname_p, rb_encoding **senc_p,
02649 const char **dname_p, rb_encoding **denc_p)
02650 {
02651 rb_encoding *senc, *denc;
02652 const char *sname, *dname;
02653 int sencidx, dencidx;
02654
02655 dencidx = enc_arg(arg1, &dname, &denc);
02656
02657 if (NIL_P(*arg2)) {
02658 sencidx = rb_enc_get_index(str);
02659 senc = rb_enc_from_index(sencidx);
02660 sname = rb_enc_name(senc);
02661 }
02662 else {
02663 sencidx = enc_arg(arg2, &sname, &senc);
02664 }
02665
02666 *sname_p = sname;
02667 *senc_p = senc;
02668 *dname_p = dname;
02669 *denc_p = denc;
02670 return dencidx;
02671 }
02672
02673 static int
02674 str_transcode0(int argc, VALUE *argv, VALUE *self, int ecflags, VALUE ecopts)
02675 {
02676 VALUE dest;
02677 VALUE str = *self;
02678 volatile VALUE arg1, arg2;
02679 long blen, slen;
02680 unsigned char *buf, *bp, *sp;
02681 const unsigned char *fromp;
02682 rb_encoding *senc, *denc;
02683 const char *sname, *dname;
02684 int dencidx;
02685
02686 if (argc <0 || argc > 2) {
02687 rb_raise(rb_eArgError, "wrong number of arguments (%d for 0..2)", argc);
02688 }
02689
02690 if (argc == 0) {
02691 arg1 = rb_enc_default_internal();
02692 if (NIL_P(arg1)) {
02693 if (!ecflags) return -1;
02694 arg1 = rb_obj_encoding(str);
02695 }
02696 ecflags |= ECONV_INVALID_REPLACE | ECONV_UNDEF_REPLACE;
02697 }
02698 else {
02699 arg1 = argv[0];
02700 }
02701 arg2 = argc<=1 ? Qnil : argv[1];
02702 dencidx = str_transcode_enc_args(str, &arg1, &arg2, &sname, &senc, &dname, &denc);
02703
02704 if ((ecflags & (ECONV_NEWLINE_DECORATOR_MASK|
02705 ECONV_XML_TEXT_DECORATOR|
02706 ECONV_XML_ATTR_CONTENT_DECORATOR|
02707 ECONV_XML_ATTR_QUOTE_DECORATOR)) == 0) {
02708 if (senc && senc == denc) {
02709 return NIL_P(arg2) ? -1 : dencidx;
02710 }
02711 if (senc && denc && rb_enc_asciicompat(senc) && rb_enc_asciicompat(denc)) {
02712 if (rb_enc_str_coderange(str) == ENC_CODERANGE_7BIT) {
02713 return dencidx;
02714 }
02715 }
02716 if (encoding_equal(sname, dname)) {
02717 return NIL_P(arg2) ? -1 : dencidx;
02718 }
02719 }
02720 else {
02721 if (encoding_equal(sname, dname)) {
02722 sname = "";
02723 dname = "";
02724 }
02725 }
02726
02727 fromp = sp = (unsigned char *)RSTRING_PTR(str);
02728 slen = RSTRING_LEN(str);
02729 blen = slen + 30;
02730 dest = rb_str_tmp_new(blen);
02731 bp = (unsigned char *)RSTRING_PTR(dest);
02732
02733 transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, sname, dname, ecflags, ecopts);
02734 if (fromp != sp+slen) {
02735 rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
02736 }
02737 buf = (unsigned char *)RSTRING_PTR(dest);
02738 *bp = '\0';
02739 rb_str_set_len(dest, bp - buf);
02740
02741
02742 if (!denc) {
02743 dencidx = rb_define_dummy_encoding(dname);
02744 }
02745 *self = dest;
02746
02747 return dencidx;
02748 }
02749
02750 static int
02751 str_transcode(int argc, VALUE *argv, VALUE *self)
02752 {
02753 VALUE opt;
02754 int ecflags = 0;
02755 VALUE ecopts = Qnil;
02756
02757 argc = rb_scan_args(argc, argv, "02:", NULL, NULL, &opt);
02758 if (!NIL_P(opt)) {
02759 ecflags = rb_econv_prepare_opts(opt, &ecopts);
02760 }
02761 return str_transcode0(argc, argv, self, ecflags, ecopts);
02762 }
02763
02764 static inline VALUE
02765 str_encode_associate(VALUE str, int encidx)
02766 {
02767 int cr = 0;
02768
02769 rb_enc_associate_index(str, encidx);
02770
02771
02772 if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
02773 rb_str_coderange_scan_restartable(RSTRING_PTR(str), RSTRING_END(str), 0, &cr);
02774 }
02775 else {
02776 cr = ENC_CODERANGE_VALID;
02777 }
02778 ENC_CODERANGE_SET(str, cr);
02779 return str;
02780 }
02781
02782
02783
02784
02785
02786
02787
02788
02789
02790
02791
02792
02793
02794
02795
02796 static VALUE
02797 str_encode_bang(int argc, VALUE *argv, VALUE str)
02798 {
02799 VALUE newstr;
02800 int encidx;
02801
02802 rb_check_frozen(str);
02803
02804 newstr = str;
02805 encidx = str_transcode(argc, argv, &newstr);
02806
02807 if (encidx < 0) return str;
02808 rb_str_shared_replace(str, newstr);
02809 return str_encode_associate(str, encidx);
02810 }
02811
02812 static VALUE encoded_dup(VALUE newstr, VALUE str, int encidx);
02813
02814
02815
02816
02817
02818
02819
02820
02821
02822
02823
02824
02825
02826
02827
02828
02829
02830
02831
02832
02833
02834
02835
02836
02837
02838
02839
02840
02841
02842
02843
02844
02845
02846
02847
02848
02849
02850
02851
02852
02853
02854
02855
02856
02857
02858
02859
02860
02861
02862
02863
02864
02865
02866
02867
02868
02869
02870 static VALUE
02871 str_encode(int argc, VALUE *argv, VALUE str)
02872 {
02873 VALUE newstr = str;
02874 int encidx = str_transcode(argc, argv, &newstr);
02875 return encoded_dup(newstr, str, encidx);
02876 }
02877
02878 VALUE
02879 rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
02880 {
02881 int argc = 1;
02882 VALUE *argv = &to;
02883 VALUE newstr = str;
02884 int encidx = str_transcode0(argc, argv, &newstr, ecflags, ecopts);
02885 return encoded_dup(newstr, str, encidx);
02886 }
02887
02888 static VALUE
02889 encoded_dup(VALUE newstr, VALUE str, int encidx)
02890 {
02891 if (encidx < 0) return rb_str_dup(str);
02892 if (newstr == str) {
02893 newstr = rb_str_dup(str);
02894 }
02895 else {
02896 RBASIC(newstr)->klass = rb_obj_class(str);
02897 }
02898 return str_encode_associate(newstr, encidx);
02899 }
02900
02901 static void
02902 econv_free(void *ptr)
02903 {
02904 rb_econv_t *ec = ptr;
02905 rb_econv_close(ec);
02906 }
02907
02908 static size_t
02909 econv_memsize(const void *ptr)
02910 {
02911 return ptr ? sizeof(rb_econv_t) : 0;
02912 }
02913
02914 static const rb_data_type_t econv_data_type = {
02915 "econv",
02916 {NULL, econv_free, econv_memsize,},
02917 };
02918
02919 static VALUE
02920 econv_s_allocate(VALUE klass)
02921 {
02922 return TypedData_Wrap_Struct(klass, &econv_data_type, NULL);
02923 }
02924
02925 static rb_encoding *
02926 make_dummy_encoding(const char *name)
02927 {
02928 rb_encoding *enc;
02929 int idx;
02930 idx = rb_define_dummy_encoding(name);
02931 enc = rb_enc_from_index(idx);
02932 return enc;
02933 }
02934
02935 static rb_encoding *
02936 make_encoding(const char *name)
02937 {
02938 rb_encoding *enc;
02939 enc = rb_enc_find(name);
02940 if (!enc)
02941 enc = make_dummy_encoding(name);
02942 return enc;
02943 }
02944
02945 static VALUE
02946 make_encobj(const char *name)
02947 {
02948 return rb_enc_from_encoding(make_encoding(name));
02949 }
02950
02951
02952
02953
02954
02955
02956
02957
02958
02959
02960
02961
02962
02963
02964
02965
02966
02967
02968
02969 static VALUE
02970 econv_s_asciicompat_encoding(VALUE klass, VALUE arg)
02971 {
02972 const char *arg_name, *result_name;
02973 rb_encoding *arg_enc, *result_enc;
02974
02975 enc_arg(&arg, &arg_name, &arg_enc);
02976
02977 result_name = rb_econv_asciicompat_encoding(arg_name);
02978
02979 if (result_name == NULL)
02980 return Qnil;
02981
02982 result_enc = make_encoding(result_name);
02983
02984 return rb_enc_from_encoding(result_enc);
02985 }
02986
02987 static void
02988 econv_args(int argc, VALUE *argv,
02989 volatile VALUE *snamev_p, volatile VALUE *dnamev_p,
02990 const char **sname_p, const char **dname_p,
02991 rb_encoding **senc_p, rb_encoding **denc_p,
02992 int *ecflags_p,
02993 VALUE *ecopts_p)
02994 {
02995 VALUE opt, flags_v, ecopts;
02996 int sidx, didx;
02997 const char *sname, *dname;
02998 rb_encoding *senc, *denc;
02999 int ecflags;
03000
03001 argc = rb_scan_args(argc, argv, "21:", snamev_p, dnamev_p, &flags_v, &opt);
03002
03003 if (!NIL_P(flags_v)) {
03004 if (!NIL_P(opt)) {
03005 rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..3)",
03006 argc + 1);
03007 }
03008 ecflags = NUM2INT(rb_to_int(flags_v));
03009 ecopts = Qnil;
03010 }
03011 else if (!NIL_P(opt)) {
03012 ecflags = rb_econv_prepare_opts(opt, &ecopts);
03013 }
03014 else {
03015 ecflags = 0;
03016 ecopts = Qnil;
03017 }
03018
03019 senc = NULL;
03020 sidx = rb_to_encoding_index(*snamev_p);
03021 if (0 <= sidx) {
03022 senc = rb_enc_from_index(sidx);
03023 }
03024 else {
03025 StringValue(*snamev_p);
03026 }
03027
03028 denc = NULL;
03029 didx = rb_to_encoding_index(*dnamev_p);
03030 if (0 <= didx) {
03031 denc = rb_enc_from_index(didx);
03032 }
03033 else {
03034 StringValue(*dnamev_p);
03035 }
03036
03037 sname = senc ? rb_enc_name(senc) : StringValueCStr(*snamev_p);
03038 dname = denc ? rb_enc_name(denc) : StringValueCStr(*dnamev_p);
03039
03040 *sname_p = sname;
03041 *dname_p = dname;
03042 *senc_p = senc;
03043 *denc_p = denc;
03044 *ecflags_p = ecflags;
03045 *ecopts_p = ecopts;
03046 }
03047
03048 static int
03049 decorate_convpath(VALUE convpath, int ecflags)
03050 {
03051 int num_decorators;
03052 const char *decorators[MAX_ECFLAGS_DECORATORS];
03053 int i;
03054 int n, len;
03055
03056 num_decorators = decorator_names(ecflags, decorators);
03057 if (num_decorators == -1)
03058 return -1;
03059
03060 len = n = RARRAY_LENINT(convpath);
03061 if (n != 0) {
03062 VALUE pair = RARRAY_PTR(convpath)[n-1];
03063 if (TYPE(pair) == T_ARRAY) {
03064 const char *sname = rb_enc_name(rb_to_encoding(RARRAY_PTR(pair)[0]));
03065 const char *dname = rb_enc_name(rb_to_encoding(RARRAY_PTR(pair)[1]));
03066 transcoder_entry_t *entry = get_transcoder_entry(sname, dname);
03067 const rb_transcoder *tr = load_transcoder_entry(entry);
03068 if (!tr)
03069 return -1;
03070 if (!DECORATOR_P(tr->src_encoding, tr->dst_encoding) &&
03071 tr->asciicompat_type == asciicompat_encoder) {
03072 n--;
03073 rb_ary_store(convpath, len + num_decorators - 1, pair);
03074 }
03075 }
03076 else {
03077 rb_ary_store(convpath, len + num_decorators - 1, pair);
03078 }
03079 }
03080
03081 for (i = 0; i < num_decorators; i++)
03082 rb_ary_store(convpath, n + i, rb_str_new_cstr(decorators[i]));
03083
03084 return 0;
03085 }
03086
03087 static void
03088 search_convpath_i(const char *sname, const char *dname, int depth, void *arg)
03089 {
03090 VALUE *ary_p = arg;
03091 VALUE v;
03092
03093 if (*ary_p == Qnil) {
03094 *ary_p = rb_ary_new();
03095 }
03096
03097 if (DECORATOR_P(sname, dname)) {
03098 v = rb_str_new_cstr(dname);
03099 }
03100 else {
03101 v = rb_assoc_new(make_encobj(sname), make_encobj(dname));
03102 }
03103 rb_ary_store(*ary_p, depth, v);
03104 }
03105
03106
03107
03108
03109
03110
03111
03112
03113
03114
03115
03116
03117
03118
03119
03120
03121
03122
03123
03124
03125
03126
03127
03128
03129
03130
03131 static VALUE
03132 econv_s_search_convpath(int argc, VALUE *argv, VALUE klass)
03133 {
03134 volatile VALUE snamev, dnamev;
03135 const char *sname, *dname;
03136 rb_encoding *senc, *denc;
03137 int ecflags;
03138 VALUE ecopts;
03139 VALUE convpath;
03140
03141 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
03142
03143 convpath = Qnil;
03144 transcode_search_path(sname, dname, search_convpath_i, &convpath);
03145
03146 if (NIL_P(convpath))
03147 rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
03148
03149 if (decorate_convpath(convpath, ecflags) == -1)
03150 rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
03151
03152 return convpath;
03153 }
03154
03155
03156
03157
03158
03159
03160 int
03161 rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding)
03162 {
03163 VALUE convpath = Qnil;
03164 transcode_search_path(from_encoding, to_encoding, search_convpath_i,
03165 &convpath);
03166 return RTEST(convpath);
03167 }
03168
03169 struct rb_econv_init_by_convpath_t {
03170 rb_econv_t *ec;
03171 int index;
03172 int ret;
03173 };
03174
03175 static void
03176 rb_econv_init_by_convpath_i(const char *sname, const char *dname, int depth, void *arg)
03177 {
03178 struct rb_econv_init_by_convpath_t *a = (struct rb_econv_init_by_convpath_t *)arg;
03179 int ret;
03180
03181 if (a->ret == -1)
03182 return;
03183
03184 ret = rb_econv_add_converter(a->ec, sname, dname, a->index);
03185
03186 a->ret = ret;
03187 return;
03188 }
03189
03190 static rb_econv_t *
03191 rb_econv_init_by_convpath(VALUE self, VALUE convpath,
03192 const char **sname_p, const char **dname_p,
03193 rb_encoding **senc_p, rb_encoding**denc_p)
03194 {
03195 rb_econv_t *ec;
03196 long i;
03197 int ret, first=1;
03198 VALUE elt;
03199 rb_encoding *senc = 0, *denc = 0;
03200 const char *sname, *dname;
03201
03202 ec = rb_econv_alloc(RARRAY_LENINT(convpath));
03203 DATA_PTR(self) = ec;
03204
03205 for (i = 0; i < RARRAY_LEN(convpath); i++) {
03206 volatile VALUE snamev, dnamev;
03207 VALUE pair;
03208 elt = rb_ary_entry(convpath, i);
03209 if (!NIL_P(pair = rb_check_array_type(elt))) {
03210 if (RARRAY_LEN(pair) != 2)
03211 rb_raise(rb_eArgError, "not a 2-element array in convpath");
03212 snamev = rb_ary_entry(pair, 0);
03213 enc_arg(&snamev, &sname, &senc);
03214 dnamev = rb_ary_entry(pair, 1);
03215 enc_arg(&dnamev, &dname, &denc);
03216 }
03217 else {
03218 sname = "";
03219 dname = StringValueCStr(elt);
03220 }
03221 if (DECORATOR_P(sname, dname)) {
03222 ret = rb_econv_add_converter(ec, sname, dname, ec->num_trans);
03223 if (ret == -1)
03224 rb_raise(rb_eArgError, "decoration failed: %s", dname);
03225 }
03226 else {
03227 int j = ec->num_trans;
03228 struct rb_econv_init_by_convpath_t arg;
03229 arg.ec = ec;
03230 arg.index = ec->num_trans;
03231 arg.ret = 0;
03232 ret = transcode_search_path(sname, dname, rb_econv_init_by_convpath_i, &arg);
03233 if (ret == -1 || arg.ret == -1)
03234 rb_raise(rb_eArgError, "adding conversion failed: %s to %s", sname, dname);
03235 if (first) {
03236 first = 0;
03237 *senc_p = senc;
03238 *sname_p = ec->elems[j].tc->transcoder->src_encoding;
03239 }
03240 *denc_p = denc;
03241 *dname_p = ec->elems[ec->num_trans-1].tc->transcoder->dst_encoding;
03242 }
03243 }
03244
03245 if (first) {
03246 *senc_p = NULL;
03247 *denc_p = NULL;
03248 *sname_p = "";
03249 *dname_p = "";
03250 }
03251
03252 ec->source_encoding_name = *sname_p;
03253 ec->destination_encoding_name = *dname_p;
03254
03255 return ec;
03256 }
03257
03258
03259
03260
03261
03262
03263
03264
03265
03266
03267
03268
03269
03270
03271
03272
03273
03274
03275
03276
03277
03278
03279
03280
03281
03282
03283
03284
03285
03286
03287
03288
03289
03290
03291
03292
03293
03294
03295
03296
03297
03298
03299
03300
03301
03302
03303
03304
03305
03306
03307
03308
03309
03310
03311
03312
03313
03314
03315
03316
03317
03318
03319
03320
03321
03322
03323
03324
03325
03326
03327
03328
03329
03330
03331
03332
03333
03334
03335
03336
03337
03338
03339
03340
03341
03342
03343
03344
03345
03346
03347
03348
03349
03350
03351
03352
03353
03354
03355
03356
03357
03358
03359
03360
03361
03362
03363
03364 static VALUE
03365 econv_init(int argc, VALUE *argv, VALUE self)
03366 {
03367 VALUE ecopts;
03368 volatile VALUE snamev, dnamev;
03369 const char *sname, *dname;
03370 rb_encoding *senc, *denc;
03371 rb_econv_t *ec;
03372 int ecflags;
03373 VALUE convpath;
03374
03375 if (rb_check_typeddata(self, &econv_data_type)) {
03376 rb_raise(rb_eTypeError, "already initialized");
03377 }
03378
03379 if (argc == 1 && !NIL_P(convpath = rb_check_array_type(argv[0]))) {
03380 ec = rb_econv_init_by_convpath(self, convpath, &sname, &dname, &senc, &denc);
03381 ecflags = 0;
03382 ecopts = Qnil;
03383 }
03384 else {
03385 econv_args(argc, argv, &snamev, &dnamev, &sname, &dname, &senc, &denc, &ecflags, &ecopts);
03386 ec = rb_econv_open_opts(sname, dname, ecflags, ecopts);
03387 }
03388
03389 if (!ec) {
03390 rb_exc_raise(rb_econv_open_exc(sname, dname, ecflags));
03391 }
03392
03393 if (!DECORATOR_P(sname, dname)) {
03394 if (!senc)
03395 senc = make_dummy_encoding(sname);
03396 if (!denc)
03397 denc = make_dummy_encoding(dname);
03398 }
03399
03400 ec->source_encoding = senc;
03401 ec->destination_encoding = denc;
03402
03403 DATA_PTR(self) = ec;
03404
03405 return self;
03406 }
03407
03408
03409
03410
03411
03412
03413
03414
03415
03416
03417
03418 static VALUE
03419 econv_inspect(VALUE self)
03420 {
03421 const char *cname = rb_obj_classname(self);
03422 rb_econv_t *ec;
03423
03424 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
03425 if (!ec)
03426 return rb_sprintf("#<%s: uninitialized>", cname);
03427 else {
03428 const char *sname = ec->source_encoding_name;
03429 const char *dname = ec->destination_encoding_name;
03430 VALUE str;
03431 str = rb_sprintf("#<%s: ", cname);
03432 econv_description(sname, dname, ec->flags, str);
03433 rb_str_cat2(str, ">");
03434 return str;
03435 }
03436 }
03437
03438 static rb_econv_t *
03439 check_econv(VALUE self)
03440 {
03441 rb_econv_t *ec;
03442
03443 TypedData_Get_Struct(self, rb_econv_t, &econv_data_type, ec);
03444 if (!ec) {
03445 rb_raise(rb_eTypeError, "uninitialized encoding converter");
03446 }
03447 return ec;
03448 }
03449
03450
03451
03452
03453
03454
03455
03456 static VALUE
03457 econv_source_encoding(VALUE self)
03458 {
03459 rb_econv_t *ec = check_econv(self);
03460 if (!ec->source_encoding)
03461 return Qnil;
03462 return rb_enc_from_encoding(ec->source_encoding);
03463 }
03464
03465
03466
03467
03468
03469
03470
03471 static VALUE
03472 econv_destination_encoding(VALUE self)
03473 {
03474 rb_econv_t *ec = check_econv(self);
03475 if (!ec->destination_encoding)
03476 return Qnil;
03477 return rb_enc_from_encoding(ec->destination_encoding);
03478 }
03479
03480
03481
03482
03483
03484
03485
03486
03487
03488
03489
03490
03491
03492
03493
03494
03495
03496
03497
03498
03499
03500
03501
03502 static VALUE
03503 econv_convpath(VALUE self)
03504 {
03505 rb_econv_t *ec = check_econv(self);
03506 VALUE result;
03507 int i;
03508
03509 result = rb_ary_new();
03510 for (i = 0; i < ec->num_trans; i++) {
03511 const rb_transcoder *tr = ec->elems[i].tc->transcoder;
03512 VALUE v;
03513 if (DECORATOR_P(tr->src_encoding, tr->dst_encoding))
03514 v = rb_str_new_cstr(tr->dst_encoding);
03515 else
03516 v = rb_assoc_new(make_encobj(tr->src_encoding), make_encobj(tr->dst_encoding));
03517 rb_ary_push(result, v);
03518 }
03519 return result;
03520 }
03521
03522
03523
03524
03525
03526 static VALUE
03527 econv_equal(VALUE self, VALUE other)
03528 {
03529 rb_econv_t *ec1 = check_econv(self);
03530 rb_econv_t *ec2;
03531 int i;
03532
03533 if (!rb_typeddata_is_kind_of(other, &econv_data_type)) {
03534 return Qnil;
03535 }
03536 ec2 = DATA_PTR(other);
03537 if (!ec2) return Qfalse;
03538 if (ec1->source_encoding_name != ec2->source_encoding_name &&
03539 strcmp(ec1->source_encoding_name, ec2->source_encoding_name))
03540 return Qfalse;
03541 if (ec1->destination_encoding_name != ec2->destination_encoding_name &&
03542 strcmp(ec1->destination_encoding_name, ec2->destination_encoding_name))
03543 return Qfalse;
03544 if (ec1->flags != ec2->flags) return Qfalse;
03545 if (ec1->replacement_enc != ec2->replacement_enc &&
03546 strcmp(ec1->replacement_enc, ec2->replacement_enc))
03547 return Qfalse;
03548 if (ec1->replacement_len != ec2->replacement_len) return Qfalse;
03549 if (ec1->replacement_str != ec2->replacement_str &&
03550 memcmp(ec1->replacement_str, ec2->replacement_str, ec2->replacement_len))
03551 return Qfalse;
03552
03553 if (ec1->num_trans != ec2->num_trans) return Qfalse;
03554 for (i = 0; i < ec1->num_trans; i++) {
03555 if (ec1->elems[i].tc->transcoder != ec2->elems[i].tc->transcoder)
03556 return Qfalse;
03557 }
03558 return Qtrue;
03559 }
03560
03561 static VALUE
03562 econv_result_to_symbol(rb_econv_result_t res)
03563 {
03564 switch (res) {
03565 case econv_invalid_byte_sequence: return sym_invalid_byte_sequence;
03566 case econv_incomplete_input: return sym_incomplete_input;
03567 case econv_undefined_conversion: return sym_undefined_conversion;
03568 case econv_destination_buffer_full: return sym_destination_buffer_full;
03569 case econv_source_buffer_empty: return sym_source_buffer_empty;
03570 case econv_finished: return sym_finished;
03571 case econv_after_output: return sym_after_output;
03572 default: return INT2NUM(res);
03573 }
03574 }
03575
03576
03577
03578
03579
03580
03581
03582
03583
03584
03585
03586
03587
03588
03589
03590
03591
03592
03593
03594
03595
03596
03597
03598
03599
03600
03601
03602
03603
03604
03605
03606
03607
03608
03609
03610
03611
03612
03613
03614
03615
03616
03617
03618
03619
03620
03621
03622
03623
03624
03625
03626
03627
03628
03629
03630
03631
03632
03633
03634
03635
03636
03637
03638
03639
03640
03641
03642
03643
03644
03645
03646
03647
03648
03649
03650
03651
03652
03653
03654
03655
03656
03657
03658
03659
03660
03661
03662
03663
03664
03665
03666
03667 static VALUE
03668 econv_primitive_convert(int argc, VALUE *argv, VALUE self)
03669 {
03670 VALUE input, output, output_byteoffset_v, output_bytesize_v, opt, flags_v;
03671 rb_econv_t *ec = check_econv(self);
03672 rb_econv_result_t res;
03673 const unsigned char *ip, *is;
03674 unsigned char *op, *os;
03675 long output_byteoffset, output_bytesize;
03676 unsigned long output_byteend;
03677 int flags;
03678
03679 argc = rb_scan_args(argc, argv, "23:", &input, &output, &output_byteoffset_v, &output_bytesize_v, &flags_v, &opt);
03680
03681 if (NIL_P(output_byteoffset_v))
03682 output_byteoffset = 0;
03683 else
03684 output_byteoffset = NUM2LONG(output_byteoffset_v);
03685
03686 if (NIL_P(output_bytesize_v))
03687 output_bytesize = 0;
03688 else
03689 output_bytesize = NUM2LONG(output_bytesize_v);
03690
03691 if (!NIL_P(flags_v)) {
03692 if (!NIL_P(opt)) {
03693 rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..5)",
03694 argc + 1);
03695 }
03696 flags = NUM2INT(rb_to_int(flags_v));
03697 }
03698 else if (!NIL_P(opt)) {
03699 VALUE v;
03700 flags = 0;
03701 v = rb_hash_aref(opt, sym_partial_input);
03702 if (RTEST(v))
03703 flags |= ECONV_PARTIAL_INPUT;
03704 v = rb_hash_aref(opt, sym_after_output);
03705 if (RTEST(v))
03706 flags |= ECONV_AFTER_OUTPUT;
03707 }
03708 else {
03709 flags = 0;
03710 }
03711
03712 StringValue(output);
03713 if (!NIL_P(input))
03714 StringValue(input);
03715 rb_str_modify(output);
03716
03717 if (NIL_P(output_bytesize_v)) {
03718 output_bytesize = RSTRING_EMBED_LEN_MAX;
03719 if (!NIL_P(input) && output_bytesize < RSTRING_LEN(input))
03720 output_bytesize = RSTRING_LEN(input);
03721 }
03722
03723 retry:
03724
03725 if (NIL_P(output_byteoffset_v))
03726 output_byteoffset = RSTRING_LEN(output);
03727
03728 if (output_byteoffset < 0)
03729 rb_raise(rb_eArgError, "negative output_byteoffset");
03730
03731 if (RSTRING_LEN(output) < output_byteoffset)
03732 rb_raise(rb_eArgError, "output_byteoffset too big");
03733
03734 if (output_bytesize < 0)
03735 rb_raise(rb_eArgError, "negative output_bytesize");
03736
03737 output_byteend = (unsigned long)output_byteoffset +
03738 (unsigned long)output_bytesize;
03739
03740 if (output_byteend < (unsigned long)output_byteoffset ||
03741 LONG_MAX < output_byteend)
03742 rb_raise(rb_eArgError, "output_byteoffset+output_bytesize too big");
03743
03744 if (rb_str_capacity(output) < output_byteend)
03745 rb_str_resize(output, output_byteend);
03746
03747 if (NIL_P(input)) {
03748 ip = is = NULL;
03749 }
03750 else {
03751 ip = (const unsigned char *)RSTRING_PTR(input);
03752 is = ip + RSTRING_LEN(input);
03753 }
03754
03755 op = (unsigned char *)RSTRING_PTR(output) + output_byteoffset;
03756 os = op + output_bytesize;
03757
03758 res = rb_econv_convert(ec, &ip, is, &op, os, flags);
03759 rb_str_set_len(output, op-(unsigned char *)RSTRING_PTR(output));
03760 if (!NIL_P(input))
03761 rb_str_drop_bytes(input, ip - (unsigned char *)RSTRING_PTR(input));
03762
03763 if (NIL_P(output_bytesize_v) && res == econv_destination_buffer_full) {
03764 if (LONG_MAX / 2 < output_bytesize)
03765 rb_raise(rb_eArgError, "too long conversion result");
03766 output_bytesize *= 2;
03767 output_byteoffset_v = Qnil;
03768 goto retry;
03769 }
03770
03771 if (ec->destination_encoding) {
03772 rb_enc_associate(output, ec->destination_encoding);
03773 }
03774
03775 return econv_result_to_symbol(res);
03776 }
03777
03778
03779
03780
03781
03782
03783
03784
03785
03786
03787
03788
03789
03790
03791
03792
03793
03794
03795
03796
03797
03798
03799
03800
03801
03802
03803
03804
03805
03806
03807
03808
03809
03810
03811
03812 static VALUE
03813 econv_convert(VALUE self, VALUE source_string)
03814 {
03815 VALUE ret, dst;
03816 VALUE av[5];
03817 int ac;
03818 rb_econv_t *ec = check_econv(self);
03819
03820 StringValue(source_string);
03821
03822 dst = rb_str_new(NULL, 0);
03823
03824 av[0] = rb_str_dup(source_string);
03825 av[1] = dst;
03826 av[2] = Qnil;
03827 av[3] = Qnil;
03828 av[4] = INT2NUM(ECONV_PARTIAL_INPUT);
03829 ac = 5;
03830
03831 ret = econv_primitive_convert(ac, av, self);
03832
03833 if (ret == sym_invalid_byte_sequence ||
03834 ret == sym_undefined_conversion ||
03835 ret == sym_incomplete_input) {
03836 VALUE exc = make_econv_exception(ec);
03837 rb_exc_raise(exc);
03838 }
03839
03840 if (ret == sym_finished) {
03841 rb_raise(rb_eArgError, "converter already finished");
03842 }
03843
03844 if (ret != sym_source_buffer_empty) {
03845 rb_bug("unexpected result of econv_primitive_convert");
03846 }
03847
03848 return dst;
03849 }
03850
03851
03852
03853
03854
03855
03856
03857
03858
03859
03860
03861
03862 static VALUE
03863 econv_finish(VALUE self)
03864 {
03865 VALUE ret, dst;
03866 VALUE av[5];
03867 int ac;
03868 rb_econv_t *ec = check_econv(self);
03869
03870 dst = rb_str_new(NULL, 0);
03871
03872 av[0] = Qnil;
03873 av[1] = dst;
03874 av[2] = Qnil;
03875 av[3] = Qnil;
03876 av[4] = INT2NUM(0);
03877 ac = 5;
03878
03879 ret = econv_primitive_convert(ac, av, self);
03880
03881 if (ret == sym_invalid_byte_sequence ||
03882 ret == sym_undefined_conversion ||
03883 ret == sym_incomplete_input) {
03884 VALUE exc = make_econv_exception(ec);
03885 rb_exc_raise(exc);
03886 }
03887
03888 if (ret != sym_finished) {
03889 rb_bug("unexpected result of econv_primitive_convert");
03890 }
03891
03892 return dst;
03893 }
03894
03895
03896
03897
03898
03899
03900
03901
03902
03903
03904
03905
03906
03907
03908
03909
03910
03911
03912
03913
03914
03915
03916
03917
03918
03919
03920
03921
03922
03923
03924
03925
03926
03927
03928
03929
03930
03931
03932
03933
03934
03935
03936
03937
03938
03939
03940
03941
03942
03943
03944
03945
03946
03947
03948
03949
03950
03951
03952
03953
03954
03955
03956
03957
03958
03959
03960
03961
03962
03963
03964
03965
03966
03967
03968
03969
03970 static VALUE
03971 econv_primitive_errinfo(VALUE self)
03972 {
03973 rb_econv_t *ec = check_econv(self);
03974
03975 VALUE ary;
03976
03977 ary = rb_ary_new2(5);
03978
03979 rb_ary_store(ary, 0, econv_result_to_symbol(ec->last_error.result));
03980 rb_ary_store(ary, 4, Qnil);
03981
03982 if (ec->last_error.source_encoding)
03983 rb_ary_store(ary, 1, rb_str_new2(ec->last_error.source_encoding));
03984
03985 if (ec->last_error.destination_encoding)
03986 rb_ary_store(ary, 2, rb_str_new2(ec->last_error.destination_encoding));
03987
03988 if (ec->last_error.error_bytes_start) {
03989 rb_ary_store(ary, 3, rb_str_new((const char *)ec->last_error.error_bytes_start, ec->last_error.error_bytes_len));
03990 rb_ary_store(ary, 4, rb_str_new((const char *)ec->last_error.error_bytes_start + ec->last_error.error_bytes_len, ec->last_error.readagain_len));
03991 }
03992
03993 return ary;
03994 }
03995
03996
03997
03998
03999
04000
04001
04002
04003
04004
04005
04006
04007
04008
04009
04010
04011
04012
04013
04014
04015
04016
04017
04018
04019
04020
04021
04022
04023
04024
04025
04026
04027
04028 static VALUE
04029 econv_insert_output(VALUE self, VALUE string)
04030 {
04031 const char *insert_enc;
04032
04033 int ret;
04034
04035 rb_econv_t *ec = check_econv(self);
04036
04037 StringValue(string);
04038 insert_enc = rb_econv_encoding_to_insert_output(ec);
04039 string = rb_str_encode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0, Qnil);
04040
04041 ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc);
04042 if (ret == -1) {
04043 rb_raise(rb_eArgError, "too big string");
04044 }
04045
04046 return Qnil;
04047 }
04048
04049
04050
04051
04052
04053
04054
04055
04056
04057
04058
04059
04060
04061
04062
04063
04064
04065
04066
04067
04068
04069
04070
04071
04072
04073 static VALUE
04074 econv_putback(int argc, VALUE *argv, VALUE self)
04075 {
04076 rb_econv_t *ec = check_econv(self);
04077 int n;
04078 int putbackable;
04079 VALUE str, max;
04080
04081 rb_scan_args(argc, argv, "01", &max);
04082
04083 if (NIL_P(max))
04084 n = rb_econv_putbackable(ec);
04085 else {
04086 n = NUM2INT(max);
04087 putbackable = rb_econv_putbackable(ec);
04088 if (putbackable < n)
04089 n = putbackable;
04090 }
04091
04092 str = rb_str_new(NULL, n);
04093 rb_econv_putback(ec, (unsigned char *)RSTRING_PTR(str), n);
04094
04095 if (ec->source_encoding) {
04096 rb_enc_associate(str, ec->source_encoding);
04097 }
04098
04099 return str;
04100 }
04101
04102
04103
04104
04105
04106
04107
04108
04109
04110
04111
04112
04113
04114
04115
04116
04117
04118
04119
04120
04121
04122 static VALUE
04123 econv_last_error(VALUE self)
04124 {
04125 rb_econv_t *ec = check_econv(self);
04126 VALUE exc;
04127
04128 exc = make_econv_exception(ec);
04129 if (NIL_P(exc))
04130 return Qnil;
04131 return exc;
04132 }
04133
04134
04135
04136
04137
04138
04139
04140
04141
04142
04143
04144
04145
04146 static VALUE
04147 econv_get_replacement(VALUE self)
04148 {
04149 rb_econv_t *ec = check_econv(self);
04150 int ret;
04151 rb_encoding *enc;
04152
04153 ret = make_replacement(ec);
04154 if (ret == -1) {
04155 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
04156 }
04157
04158 enc = rb_enc_find(ec->replacement_enc);
04159 return rb_enc_str_new((const char *)ec->replacement_str, (long)ec->replacement_len, enc);
04160 }
04161
04162
04163
04164
04165
04166
04167
04168
04169
04170
04171
04172 static VALUE
04173 econv_set_replacement(VALUE self, VALUE arg)
04174 {
04175 rb_econv_t *ec = check_econv(self);
04176 VALUE string = arg;
04177 int ret;
04178 rb_encoding *enc;
04179
04180 StringValue(string);
04181 enc = rb_enc_get(string);
04182
04183 ret = rb_econv_set_replacement(ec,
04184 (const unsigned char *)RSTRING_PTR(string),
04185 RSTRING_LEN(string),
04186 rb_enc_name(enc));
04187
04188 if (ret == -1) {
04189
04190 rb_raise(rb_eUndefinedConversionError, "replacement character setup failed");
04191 }
04192
04193 return arg;
04194 }
04195
04196 VALUE
04197 rb_econv_make_exception(rb_econv_t *ec)
04198 {
04199 return make_econv_exception(ec);
04200 }
04201
04202 void
04203 rb_econv_check_error(rb_econv_t *ec)
04204 {
04205 VALUE exc;
04206
04207 exc = make_econv_exception(ec);
04208 if (NIL_P(exc))
04209 return;
04210 rb_exc_raise(exc);
04211 }
04212
04213
04214
04215
04216
04217
04218
04219 static VALUE
04220 ecerr_source_encoding_name(VALUE self)
04221 {
04222 return rb_attr_get(self, rb_intern("source_encoding_name"));
04223 }
04224
04225
04226
04227
04228
04229
04230
04231
04232
04233
04234
04235
04236
04237
04238
04239
04240
04241
04242
04243
04244
04245 static VALUE
04246 ecerr_source_encoding(VALUE self)
04247 {
04248 return rb_attr_get(self, rb_intern("source_encoding"));
04249 }
04250
04251
04252
04253
04254
04255
04256
04257 static VALUE
04258 ecerr_destination_encoding_name(VALUE self)
04259 {
04260 return rb_attr_get(self, rb_intern("destination_encoding_name"));
04261 }
04262
04263
04264
04265
04266
04267
04268
04269 static VALUE
04270 ecerr_destination_encoding(VALUE self)
04271 {
04272 return rb_attr_get(self, rb_intern("destination_encoding"));
04273 }
04274
04275
04276
04277
04278
04279
04280
04281
04282
04283
04284
04285
04286
04287
04288
04289
04290 static VALUE
04291 ecerr_error_char(VALUE self)
04292 {
04293 return rb_attr_get(self, rb_intern("error_char"));
04294 }
04295
04296
04297
04298
04299
04300
04301
04302
04303
04304
04305
04306
04307
04308
04309
04310
04311 static VALUE
04312 ecerr_error_bytes(VALUE self)
04313 {
04314 return rb_attr_get(self, rb_intern("error_bytes"));
04315 }
04316
04317
04318
04319
04320
04321
04322
04323 static VALUE
04324 ecerr_readagain_bytes(VALUE self)
04325 {
04326 return rb_attr_get(self, rb_intern("readagain_bytes"));
04327 }
04328
04329
04330
04331
04332
04333
04334
04335
04336
04337
04338
04339
04340
04341
04342
04343
04344
04345
04346
04347
04348
04349
04350
04351
04352
04353 static VALUE
04354 ecerr_incomplete_input(VALUE self)
04355 {
04356 return rb_attr_get(self, rb_intern("incomplete_input"));
04357 }
04358
04359
04360
04361
04362
04363
04364
04365
04366
04367
04368
04369
04370
04371
04372
04373
04374
04375
04376
04377
04378
04379
04380
04381 void
04382 Init_transcode(void)
04383 {
04384 rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
04385 rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
04386 rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);
04387
04388 transcoder_table = st_init_strcasetable();
04389
04390 sym_invalid = ID2SYM(rb_intern("invalid"));
04391 sym_undef = ID2SYM(rb_intern("undef"));
04392 sym_replace = ID2SYM(rb_intern("replace"));
04393 sym_fallback = ID2SYM(rb_intern("fallback"));
04394 sym_aref = ID2SYM(rb_intern("[]"));
04395 sym_xml = ID2SYM(rb_intern("xml"));
04396 sym_text = ID2SYM(rb_intern("text"));
04397 sym_attr = ID2SYM(rb_intern("attr"));
04398
04399 sym_invalid_byte_sequence = ID2SYM(rb_intern("invalid_byte_sequence"));
04400 sym_undefined_conversion = ID2SYM(rb_intern("undefined_conversion"));
04401 sym_destination_buffer_full = ID2SYM(rb_intern("destination_buffer_full"));
04402 sym_source_buffer_empty = ID2SYM(rb_intern("source_buffer_empty"));
04403 sym_finished = ID2SYM(rb_intern("finished"));
04404 sym_after_output = ID2SYM(rb_intern("after_output"));
04405 sym_incomplete_input = ID2SYM(rb_intern("incomplete_input"));
04406 sym_universal_newline = ID2SYM(rb_intern("universal_newline"));
04407 sym_crlf_newline = ID2SYM(rb_intern("crlf_newline"));
04408 sym_cr_newline = ID2SYM(rb_intern("cr_newline"));
04409 sym_partial_input = ID2SYM(rb_intern("partial_input"));
04410
04411 #ifdef ENABLE_ECONV_NEWLINE_OPTION
04412 sym_newline = ID2SYM(rb_intern("newline"));
04413 sym_universal = ID2SYM(rb_intern("universal"));
04414 sym_crlf = ID2SYM(rb_intern("crlf"));
04415 sym_cr = ID2SYM(rb_intern("cr"));
04416 sym_lf = ID2SYM(rb_intern("lf"));
04417 #endif
04418
04419 rb_define_method(rb_cString, "encode", str_encode, -1);
04420 rb_define_method(rb_cString, "encode!", str_encode_bang, -1);
04421
04422 rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cData);
04423 rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate);
04424 rb_define_singleton_method(rb_cEncodingConverter, "asciicompat_encoding", econv_s_asciicompat_encoding, 1);
04425 rb_define_singleton_method(rb_cEncodingConverter, "search_convpath", econv_s_search_convpath, -1);
04426 rb_define_method(rb_cEncodingConverter, "initialize", econv_init, -1);
04427 rb_define_method(rb_cEncodingConverter, "inspect", econv_inspect, 0);
04428 rb_define_method(rb_cEncodingConverter, "convpath", econv_convpath, 0);
04429 rb_define_method(rb_cEncodingConverter, "source_encoding", econv_source_encoding, 0);
04430 rb_define_method(rb_cEncodingConverter, "destination_encoding", econv_destination_encoding, 0);
04431 rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, -1);
04432 rb_define_method(rb_cEncodingConverter, "convert", econv_convert, 1);
04433 rb_define_method(rb_cEncodingConverter, "finish", econv_finish, 0);
04434 rb_define_method(rb_cEncodingConverter, "primitive_errinfo", econv_primitive_errinfo, 0);
04435 rb_define_method(rb_cEncodingConverter, "insert_output", econv_insert_output, 1);
04436 rb_define_method(rb_cEncodingConverter, "putback", econv_putback, -1);
04437 rb_define_method(rb_cEncodingConverter, "last_error", econv_last_error, 0);
04438 rb_define_method(rb_cEncodingConverter, "replacement", econv_get_replacement, 0);
04439 rb_define_method(rb_cEncodingConverter, "replacement=", econv_set_replacement, 1);
04440 rb_define_method(rb_cEncodingConverter, "==", econv_equal, 1);
04441
04442 rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
04443 rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
04444 rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
04445 rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
04446 rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));
04447 rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
04448 rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT));
04449 rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR));
04450 rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR));
04451 rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR));
04452 rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR));
04453 rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR));
04454 rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR));
04455
04456 rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
04457 rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
04458 rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
04459 rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
04460 rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);
04461
04462 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
04463 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
04464 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
04465 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
04466 rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
04467 rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
04468 rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
04469
04470 Init_newline();
04471 }
04472