view sbr/ml_codeconv.c @ 7:c20e4181370f

utf-8 input assumption in case of base64/utf-8
author kono
date Sun, 04 Dec 2005 02:30:39 +0900
parents d802748a597d
children 77780b728543
line wrap: on
line source

/* ml_codeconv.c - (multilingual) code conversion */
/*			by takada@seraph.ntt.jp   */
/*			arranged by MH-plus project */

#ifdef JAPAN

#include "../h/mh.h"
#include <ctype.h>
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>

/* coding system */
typedef int coding_system_t;
#define CS_DEFAULT	 0
#define CS_JIS7		 1
#define CS_JEUC		 2
#define CS_SJIS		 3
#define CS_UTF8		 4
#define CS_NOCONV	99

/* coding system list */
#define CSL_SIZE	3
#define CSL_DISPLAY(csl)	((csl)[0])
#define CSL_FILE(csl)		((csl)[1])
#define CSL_PROCESS(csl)	((csl)[2])

/* codeset status */
#define ASCII 		0
#define JISX0208	1

#define IS_JIS7(c)	((0x21 <= (c)) && ((c) <= 0x7e))
#define IS_JEUC(c)	((0xa1 <= (c)) && ((c) <= 0xfe))
#define IS_SJIS1(c)	(((0x81 <= (c)) && ((c) <= 0x9f)) || \
			 ((0xe0 <= (c)) && ((c) <= 0xfc)))
#define IS_SJIS2(c)	((0x40 <= (c)) && ((c) <= 0xfc) && ((c) != 0x7f))

/* encoding (mhn) */

#define CE_UNKNOWN      0x00
#define CE_BASE64       0x01
#define CE_QUOTED       0x02
#define CE_8BIT         0x03
#define CE_7BIT         0x04
#define CE_BINARY       0x05
#define CE_EXTENSION    0x06
#define CE_EXTERNAL     0x07    /* for external-body */

/* hold coding system information */
static coding_system_t ml_coding_info[CSL_SIZE] =
       { CS_DEFAULT, CS_DEFAULT, CS_DEFAULT };
int japan_environ;

/* private functions */
static void read_profile();
static coding_system_t coding_to_cs();
static coding_system_t select_coding_system();
static void ml_fputs_sbr();
static void jeuc_fputs();
static void sjis_fputs();
static void jis7_fputs();
static void utf8_fputs();
static void cntrl_putc();
static void ml_conv_sbr();

/* NKF Input/Output pointers */

static unsigned char *nkf_output;
static unsigned char *nkf_input;
static int nkf_input_ctr;
static int nkf_output_ctr;
static int nkf_limit;
static int nkf_ilimit;
static void (*nkf_flush)(unsigned char *,int);

static void nkf_open(unsigned char *opt, unsigned char *in,int ilimit,
	unsigned char *out,int limit,void (*flush)(unsigned char *,int),FILE *fp);
static void nkf_continue(unsigned char *in,int ilimit) ;
static void nkf_end() ;
static void none(unsigned char *a,int b);

static void my_flush(unsigned char *out,int out_count);
static void extend(unsigned char *out,int out_count);
static void my_pretty(unsigned char *out,int out_count);

static FILE *nkf_file;


/*
 * Initialize the module: fill ml_coding_info (the display, file and
 * process coding systems) by calling read_profile(), which also sets
 * japan_environ.
 */
void
ml_init()
{
    read_profile(ml_coding_info);
}

/* get coding system list from profile and environment variable */
/*
 * Pick the coding-system name for one slot: the environment variable
 * env_name wins, then the profile entry profile_key (via m_find), and
 * finally the supplied fallback string.
 */
static char *
profile_coding_name(env_name, profile_key, fallback)
     char *env_name;
     char *profile_key;
     char *fallback;
{
    char *name;

    name = getenv(env_name);
    if (name == NULL)
	name = m_find(profile_key);
    if (name == NULL)
	name = fallback;
    return name;
}

/*
 * Fill csl[] (display, file, process) from the environment variables
 * MH_DISPLAY_CODING / MH_FILE_CODING / MH_PROCESS_CODING or, failing
 * that, the profile entries display-coding / file-coding /
 * process-coding.  Unset entries map to CS_DEFAULT.
 *
 * Also sets japan_environ: 0 when the file coding recorded in
 * ml_coding_info is CS_NOCONV, 1 otherwise.
 */
static void
read_profile(csl)
     coding_system_t csl[];
{
    char *fallback;

#if 0 /* We won't refer $LANG nor $LC_CTYPE */
    if ((fallback = getenv("LC_CTYPE")) == NULL)
      fallback = getenv("LANG");
#else
    fallback = ""; /* empty name maps to CS_DEFAULT */
#endif

    CSL_DISPLAY(csl) = coding_to_cs(
	profile_coding_name("MH_DISPLAY_CODING", "display-coding", fallback));
    CSL_FILE(csl) = coding_to_cs(
	profile_coding_name("MH_FILE_CODING", "file-coding", fallback));
    CSL_PROCESS(csl) = coding_to_cs(
	profile_coding_name("MH_PROCESS_CODING", "process-coding", fallback));

    /* Japanese handling is switched off only for an unconverted file coding. */
    japan_environ = (CSL_FILE(ml_coding_info) == CS_NOCONV) ? 0 : 1;
}

/*
 * Map a coding-system name (compared with uleq) to a CS_* value.
 * An empty name and any unrecognized name both yield CS_DEFAULT.
 */
static coding_system_t
coding_to_cs(coding)
     char *coding;
{
    /* Checked top to bottom; the second group is kept for backward
       compatibility with older names. */
    static const struct {
	char *name;
	coding_system_t cs;
    } known[] = {
	{ "ja_JP.JIS7",   CS_JIS7 },
	{ "ja_JP.EUC",    CS_JEUC },
	{ "ja_JP.EUCjp",  CS_JEUC },
	{ "ja_JP.SJIS",   CS_SJIS },
	{ "ja_JP.UTF-8",  CS_UTF8 },
	{ "C",            CS_NOCONV },

	{ "japanese",     CS_JIS7 },
	{ "ja_JP.jis8",   CS_JIS7 },
	{ "ja_JP.pjis",   CS_JIS7 },
	{ "ja_JP.jis",    CS_JIS7 },
	{ "wr_WR.ct",     CS_JIS7 },
	{ "wr_WR.junet",  CS_JIS7 },
	{ "ja_JP.ujis",   CS_JEUC },
	{ "ja_JP.mscode", CS_SJIS },
	{ "noconv",       CS_NOCONV }
    };
    int i;

    if (*coding == '\0')
	return CS_DEFAULT;

    for (i = 0; i < (int)(sizeof known / sizeof known[0]); i++) {
	if (uleq(coding, known[i].name))
	    return known[i].cs;
    }
    return CS_DEFAULT;
}

/*
 * Choose which configured coding system applies to stream, based on
 * the file type reported by fstat():
 *   regular file              -> file coding
 *   FIFO, or a zero file type -> process coding
 *   anything else             -> display coding
 * Calls adios() if fstat() fails.
 */
static coding_system_t
select_coding_system(stream)
     FILE *stream;
{
    struct stat st;
    mode_t kind;

    if (fstat(fileno(stream), &st) != 0)
	adios (NULLCP, "unable to fstat stream");

    kind = st.st_mode & S_IFMT;

    if (kind == S_IFREG)
	return CSL_FILE(ml_coding_info);

    /* some system returns zero-filled stat for pipe */
    if (kind == S_IFIFO || kind == 0)
	return CSL_PROCESS(ml_coding_info);

    /* character devices and everything else */
    return CSL_DISPLAY(ml_coding_info);
}


/*
 *
 */
/*
 * Return non-zero if c lies in the IS_JEUC byte range (0xa1..0xfe)
 * and japan_environ is set; otherwise return 0.
 */
int
ml_ismlchar(c)
     unsigned char c;
{
    if (!japan_environ)
	return 0;
    return IS_JEUC(c);
}

/*
 * Return non-zero if p points at two consecutive bytes that are both
 * in the IS_JEUC range and japan_environ is set; otherwise return 0.
 * The second byte is examined only when the first one qualifies.
 */
int
ml_ismlptr(p)
     unsigned char *p;
{
    if (!japan_environ)
	return 0;
    return IS_JEUC(p[0]) && IS_JEUC(p[1]);
}


/*
 * Output:
 */
/*
 * Write scanlk to stream through ml_fputs_sbr() with the "pretty"
 * flag off, i.e. control characters are passed through unchanged.
 */
void
ml_fputs(scanlk, stream)
     char *scanlk;
     FILE *stream;
{
    ml_fputs_sbr(scanlk, stream, 0);
}

/*
 * Write scanlk to stream through ml_fputs_sbr() with the "pretty"
 * flag on, so control characters are emitted in a visible form.
 */
void
ml_pretty_fputs(scanlk, stream)
     char *scanlk;
     FILE *stream;
{
    ml_fputs_sbr(scanlk, stream, 1);
}

/*
 * Write scanlk to stream with jis7_fputs() directly (pretty flag off),
 * bypassing the per-stream coding-system selection done by ml_fputs().
 */
void
junet_fputs(scanlk, stream)
     char *scanlk;
     FILE *stream;
{
    jis7_fputs(scanlk, stream, 0);
}


static void
ml_fputs_sbr(scanlk, stream, pretty)
     char *scanlk;
     FILE *stream;
     int pretty;
{
    switch (select_coding_system(stream)) {
      case CS_NOCONV:
	fputs(scanlk, stream);
	break;
      case CS_SJIS:
	sjis_fputs(scanlk, stream, pretty);
	break;
      case CS_JEUC:
	jeuc_fputs(scanlk, stream, pretty);
	break;
      case CS_UTF8:
	utf8_fputs(scanlk, stream, pretty);
	break;
      case CS_JIS7:
      case CS_DEFAULT:
      default:
	jis7_fputs(scanlk, stream, pretty);
	break;
    }
}


/*
 * Output routines with code conversion
 */

/* Staging buffer handed to nkf as its output area by utf8_fputs(). */
char buf[BUFSIZ];

/*
 * Write scanlk to stream after running it through nkf with the
 * "-w80" option.  The input is read up to its terminating NUL
 * (input limit -1); converted bytes are staged in buf and delivered
 * to stream by my_pretty (pretty != 0) or my_flush (pretty == 0).
 */
static void
utf8_fputs(scanlk, stream, pretty)
     char *scanlk;
     FILE *stream;
     int pretty;
{
    void (*sink)(unsigned char *, int);

    if (pretty)
	sink = my_pretty;
    else
	sink = my_flush;

    nkf_open((unsigned char *)"-w80",
	     (unsigned char *)scanlk, -1,
	     (unsigned char *)buf, BUFSIZ,
	     sink, stream);
    nkf_end();
}

/*
 * Write scanlk to stream, passing well-formed two-byte sequences
 * (both bytes in the IS_JEUC range) through unchanged.
 *
 * Any other byte with the high bit set - including a lead byte that is
 * not followed by a valid second byte - is replaced by a space.  When
 * pretty is non-zero, control characters go through cntrl_putc();
 * everything else is copied as is.
 */
static void
jeuc_fputs(scanlk, stream, pretty)
     char *scanlk;
     FILE *stream;
     int pretty;
{
    unsigned char lead, trail;

    for (;;) {
	lead = *scanlk++;
	if (lead == 0)
	    break;

	if (IS_JEUC(lead)) {
	    trail = *scanlk;
	    if (IS_JEUC(trail)) {
		/* complete two-byte character: consume and copy both */
		scanlk++;
		putc(lead, stream);
		putc(trail, stream);
		continue;
	    }
	}

	/* single byte; a stray lead byte also has bit 7 set */
	if (lead & 0x80) {
	    putc(' ', stream);
	} else if (pretty && iscntrl(lead)) {
	    cntrl_putc(lead, stream);
	} else {
	    putc(lead, stream);
	}
    }
}

/*
 * E2S(i1, i2, o1, o2): convert one two-byte character.
 * i1/i2 are the input bytes; NOTE that they are modified in place
 * (bit 7 is cleared).  o1/o2 receive the two result bytes.
 * All four arguments must be lvalues and are evaluated several times.
 */
#define E2S(i1, i2, o1, o2) do {\
    (i1) &= 0x7f;\
    (i2) &= 0x7f;\
    (o1) = ((i1) - 0x21) / 2 + 0x81;\
    if ((o1) > 0x9f)\
	(o1) += (0xe0 - 0xa0);\
    if (((i1) & 1) == 0) {\
	(o2) = (i2) + (0xfc - 0x7e);\
    } else {\
	(o2) = (i2) + (0x40 - 0x21);\
	if ((o2) > 0x7e)\
	    (o2)++;\
    }\
} while (0)

/*
 * Write scanlk to stream, converting each well-formed two-byte
 * sequence (both bytes in the IS_JEUC range) with the E2S macro.
 *
 * Any other byte with the high bit set - including a lead byte that is
 * not followed by a valid second byte - is replaced by a space.  When
 * pretty is non-zero, control characters go through cntrl_putc();
 * everything else is copied as is.
 */
static void
sjis_fputs(scanlk, stream, pretty)
     char *scanlk;
     FILE *stream;
     int pretty;
{
    unsigned char lead, trail, out1, out2;

    for (;;) {
	lead = *scanlk++;
	if (lead == 0)
	    break;

	if (IS_JEUC(lead)) {
	    trail = *scanlk;
	    if (IS_JEUC(trail)) {
		/* complete two-byte character: consume, convert, emit */
		scanlk++;
		E2S(lead, trail, out1, out2);
		putc(out1, stream);
		putc(out2, stream);
		continue;
	    }
	}

	/* single byte; a stray lead byte also has bit 7 set */
	if (lead & 0x80) {
	    putc(' ', stream);
	} else if (pretty && iscntrl(lead)) {
	    cntrl_putc(lead, stream);
	} else {
	    putc(lead, stream);
	}
    }
}

/*
 * Designation helpers for 7-bit JIS output.
 *
 * DSGNT_JISX0208(stream, status): if status is ASCII, write the
 * escape sequence ESC $ B to stream and set status to JISX0208.
 * DSGNT_ASCII(stream, status): if status is JISX0208, write the
 * escape sequence ESC ( B to stream and set status to ASCII.
 *
 * status must be an lvalue.  The test is made on the status argument
 * itself, so the macros work with any state variable, not only one
 * that happens to be named kanji_pos.  Each macro expands to a single
 * statement (do { } while (0)) and must be followed by a semicolon.
 */
#define DSGNT_JISX0208(stream, status) do {\
    if ((status) == ASCII) {\
	fputs("\033$B", (stream)); (status) = JISX0208;\
    }\
} while (0)

#define DSGNT_ASCII(stream, status) do {\
    if ((status) == JISX0208) {\
	fputs("\033(B", (stream)); (status) = ASCII;\
    }\
} while (0)

/*
 * Write scanlk to stream as 7-bit JIS.
 *
 * A well-formed two-byte sequence (both bytes in the IS_JEUC range) is
 * emitted with bit 7 cleared on each byte, after switching to the
 * JISX0208 state if necessary.  Every other byte is emitted in the
 * ASCII state: bytes with the high bit set (including a stray lead
 * byte) become a space, control characters go through cntrl_putc()
 * when pretty is non-zero, and the rest is copied as is.  The stream
 * is always returned to the ASCII state before the function ends.
 */
static void
jis7_fputs(scanlk, stream, pretty)
     char *scanlk;
     FILE *stream;
     int pretty;
{
    int kanji_pos = ASCII;	/* current designation: ASCII or JISX0208 */
    unsigned char lead, trail;

    for (;;) {
	lead = *scanlk++;
	if (lead == 0)
	    break;

	if (IS_JEUC(lead)) {
	    trail = *scanlk;
	    if (IS_JEUC(trail)) {
		/* complete two-byte character */
		scanlk++;
		DSGNT_JISX0208(stream, kanji_pos);
		putc(lead & 0x7f, stream);
		putc(trail & 0x7f, stream);
		continue;
	    }
	}

	/* everything below is written in the ASCII state */
	DSGNT_ASCII(stream, kanji_pos);
	if (lead & 0x80) {
	    putc(' ', stream);
	} else if (pretty && iscntrl(lead)) {
	    cntrl_putc(lead, stream);
	} else {
	    putc(lead, stream);
	}
    }
    DSGNT_ASCII(stream, kanji_pos);
}

/*
 * Write a visible two-character form of the control character c to
 * stream: backslash plus a letter for \b, \f, \n, \r and \t, and
 * '^' followed by (c ^ 0x40) for anything else.
 */
static void
cntrl_putc(c, stream)
     char c;
     FILE *stream;
{
    char letter = 0;	/* escape letter, 0 when c has no backslash form */

    switch (c) {
      case '\b': letter = 'b'; break;
      case '\f': letter = 'f'; break;
      case '\n': letter = 'n'; break;
      case '\r': letter = 'r'; break;
      case '\t': letter = 't'; break;
      default:   break;
    }

    if (letter != 0) {
	putc('\\', stream);
	putc(letter, stream);
    } else {
	putc('^', stream);
	putc(c ^ 0x40, stream);
    }
}


/*
 * Input:
 */
/*
 * Convert the string s in place using the configured file coding
 * system, with no transfer encoding (CE_UNKNOWN) and charset 0.
 * A NULL s, or a file coding of CS_NOCONV, leaves s untouched.
 * Always returns s.
 */
char *
ml_conv(s)
     char *s;
{
    coding_system_t coding;

    if (s == NULL)
	return s;

    coding = CSL_FILE(ml_coding_info);
    if (coding == CS_NOCONV)
	return s;

    ml_conv_sbr(s, coding, CE_UNKNOWN, 0);
    return s;
}

/*
 * Like ml_conv(), but the caller supplies the transfer encoding
 * (a CE_* value) and the charset of the input, both of which are
 * forwarded to ml_conv_sbr().
 * A NULL s, or a file coding of CS_NOCONV, leaves s untouched.
 * Always returns s.
 */
char *
ml_conv_decode(s,encode,charset)
     char *s;
     int encode,charset;
{
    coding_system_t coding;

    if (s == NULL)
	return s;

    coding = CSL_FILE(ml_coding_info);
    if (coding == CS_NOCONV)
	return s;

    ml_conv_sbr(s, coding, encode, charset);
    return s;
}

/* 
   Convert to EUC
   shrinking only (?)
 */

/*
 * Return the nkf option string that selects coding system cs as the
 * output code.  CS_NOCONV and any unlisted value yield "-t".
 */
static char *
cs_output_opt(int cs)
{
    if (cs == CS_JIS7)
	return "-j";
    if (cs == CS_JEUC)
	return "-e";
    if (cs == CS_SJIS)
	return "-s";
    if (cs == CS_UTF8)
	return "-w80";
    return "-t";
}

/*
 * Return the nkf option string used to read input.
 *
 *   cs             coding system of the input (CS_*), consulted only
 *                  when encode is neither CE_BASE64 nor CE_QUOTED
 *   encode         transfer encoding (CE_*)
 *   input_charset  charset of the encoded data; only CS_UTF8 is
 *                  treated specially
 *
 * For base64 / quoted-printable the option depends solely on whether
 * input_charset is CS_UTF8.  Otherwise the option is chosen from cs,
 * with "-e" for values not listed.
 */
static char *
cs_input_opt(int cs, int encode, int input_charset)
{
    int utf8_in = (input_charset == CS_UTF8);

    if (encode == CE_BASE64)
	return utf8_in ? "-emBW8" : "-emB";
    if (encode == CE_QUOTED)
	return utf8_in ? "-emQW8" : "-emQ";

    if (cs == CS_JIS7)
	return "-Je";
    if (cs == CS_JEUC)
	return "-Ee";
    if (cs == CS_SJIS)
	return "-Se";
    if (cs == CS_UTF8)
	return "-W8e";
    if (cs == CS_NOCONV)
	return "-t";
    return "-e";
}

static void
ml_conv_sbr(in, cs, encode, input_charset)
     char *in;
     coding_system_t cs;
     int encode, input_charset;
{
    char *opt = cs_input_opt(cs,encode,input_charset);
    int len = strlen(in);
    nkf_open((unsigned char *)opt,(unsigned char *)in,len,(unsigned char *)in,len,extend,0);
    nkf_end();
    nkf_output[nkf_output_ctr]=0;
}

/*
 * Run the NUL-terminated string ptr through nkf in place with the
 * option string "-me" (presumably: MIME decoding with EUC output -
 * confirm against the nkf option list).  The same buffer is used for
 * input and output, and a terminating NUL is stored at the final
 * output position.
 */
extern void
mime_convert(char *ptr)
{
    unsigned char *text = (unsigned char *)ptr;
    int nbytes = strlen(ptr);

    nkf_open((unsigned char *)"-me", text, nbytes, text, nbytes, extend, 0);
    nkf_end();

    /* terminate the converted string */
    nkf_output[nkf_output_ctr] = 0;
}

#undef CR
#undef LIMIT
#undef PROTO

#undef getc
#undef ungetc

#define getc(f)   	nkf_getc(f)
#define ungetc(c,f)	nkf_ungetc(c,f)

/*
 * Input hook that replaces getc() inside the included nkf code.
 * Reads the next byte from nkf_input; the FILE argument is ignored.
 *
 * With nkf_ilimit == -1 the input is NUL-terminated: the NUL is never
 * consumed and -1 is returned once it is reached.  Otherwise exactly
 * nkf_ilimit bytes are available and -1 is returned after the last.
 */
static int
nkf_getc(FILE *f) {
    int ch;

    if (nkf_ilimit != -1) {
	/* length-limited input */
	if (nkf_input_ctr >= nkf_ilimit)
	    return -1;
	return nkf_input[nkf_input_ctr++];
    }

    /* NUL-terminated input: stay on the terminator at end of data */
    ch = nkf_input[nkf_input_ctr];
    if (ch == 0)
	return -1;
    nkf_input_ctr++;
    return ch;
}

/*
 * Input hook that replaces ungetc() inside the included nkf code.
 * Steps the read position in nkf_input back by one byte; the FILE
 * argument is ignored and the byte itself is not rewritten.
 *
 * As with the standard ungetc(), pushing back the end-of-input value
 * (-1, which is what nkf_getc() returns at end of data) is a no-op:
 * nkf_getc() does not advance the position when it returns -1, so
 * stepping back would make the last real byte be read a second time.
 * The position is also never moved before the start of the buffer.
 *
 * Returns c, or -1 if nothing could be pushed back.
 */
static int
nkf_ungetc(int c,FILE *f) {
    if (c == -1 || nkf_input_ctr <= 0)
	return -1;
    nkf_input_ctr--;
    return c;
}


#undef putchar
#undef TRUE
#undef FALSE
#define putchar(c)	nkf_putchar(c)

#define debug nkf_debug

/*
 * Output hook that replaces putchar() inside the included nkf code.
 * Appends one byte to nkf_output and hands the buffered bytes to
 * nkf_flush when the buffer is full or a newline has been stored.
 *
 * If the buffer already holds nkf_limit bytes it is flushed BEFORE the
 * new byte is stored, so nothing is ever written at index nkf_limit,
 * which lies one past the end of a buffer of nkf_limit bytes.  The
 * sequence of bytes delivered to nkf_flush is unchanged; only the
 * chunk boundary at a full buffer moves by one byte.
 *
 * Returns the character passed in.
 */
static
int
nkf_putchar(unsigned int c) 
{
    /* make room first if the buffer is already full */
    if (nkf_output_ctr >= nkf_limit) {
	nkf_flush(nkf_output,nkf_output_ctr);
	nkf_output_ctr = 0;
    }

    nkf_output[nkf_output_ctr++] = c;

    /* a newline always ends the current chunk */
    if (c == '\n') {
	nkf_flush(nkf_output,nkf_output_ctr);
	nkf_output_ctr = 0;
    }
    return c;
}


/* Include kanji filter main part */
/* getchar and putchar will be replaced during inclusion */

#define version  NKF_version
#define PERL_XS 1
#include "../nkf-utf8/utf8tbl.c"
#include "../nkf-utf8/nkf.c"

/*
 * Start a conversion.
 *
 *   opt     nkf option string ("-w80" etc.)
 *   in      input bytes
 *   ilimit  number of input bytes, or -1 if `in' is NUL-terminated
 *   out     output buffer
 *   limit   output buffer size; `flush' is called when it fills up,
 *           after each newline, and from nkf_end()
 *   flush   callback that receives the buffered output
 *   fp      stream remembered in nkf_file for the flush callbacks
 *
 * nkf_continue() can be used afterwards to feed another input buffer.
 */
static void
nkf_open(unsigned char *opt, unsigned char *in,int ilimit,
	unsigned char *out,int limit,void (*flush)(unsigned char *,int),FILE *fp) {
    /* nkf's flags are reset at each call, then set from opt. */
    reinit();
    options(opt);

    /* input side */
    nkf_input = in;
    nkf_ilimit = ilimit;
    nkf_input_ctr = 0;

    /* output side */
    nkf_output = out;
    nkf_limit = limit;
    nkf_output_ctr = 0;
    nkf_flush = flush;
    nkf_file = fp;

    /* Conversion */
    kanji_convert(NULL);
}

/*
 * Continue a conversion started with nkf_open() on a new input
 * buffer.  Only the input pointer, length and read position are
 * replaced; the output state and nkf options are left as they are.
 */
static void
nkf_continue(unsigned char *in,int ilimit) {
    nkf_input = in;
    nkf_ilimit = ilimit;
    nkf_input_ctr = 0;

    /* Conversion */
    kanji_convert(NULL);
}

/*
 * Finish a conversion: hand any bytes still buffered in nkf_output to
 * nkf_flush and reset the output count.  Does nothing when the buffer
 * is empty.
 *
 * Defined `static' to match the forward declaration near the top of
 * the file.
 */
static void
nkf_end() {
    if (nkf_output_ctr == 0)
	return;

    nkf_flush(nkf_output,nkf_output_ctr);
    nkf_output_ctr = 0;
}

/*
 * Flush callback for in-memory conversion: nothing is written
 * anywhere; nkf_output is simply advanced past the nkf_output_ctr
 * bytes produced so far.  Both parameters are unused.
 */
static void
extend(unsigned char *out,int out_count)
{
    nkf_output += nkf_output_ctr;
}

/* Flush callback that discards its arguments and does nothing. */
static void
none(unsigned char *a,int b)
{
}

/*
 * Flush callback: write the out_count bytes at out to nkf_file as a
 * single fwrite() item.  The result of fwrite() is not checked.
 */
static void
my_flush(unsigned char *out,int out_count)
{ 
    fwrite(out,out_count,1,nkf_file);
}

/*
 * Flush callback for "pretty" output: write the out_count bytes at out
 * to nkf_file one at a time, sending control characters through
 * cntrl_putc() and everything else through putc().
 */
static void
my_pretty(unsigned char *out,int out_count)
{
    FILE *dest = nkf_file;
    int i;

    for (i = 0; i < out_count; i++) {
	int ch = out[i];

	if (iscntrl(ch)) {
	    cntrl_putc(ch, dest);
	} else {
	    putc(ch, dest);
	}
    }
}

#endif /* JAPAN */

/* end */