Mercurial > hg > Members > anatofuz > MoarVM
view src/strings/parse_num.c @ 20:ae67093f0e62
fix code segment
author | Takahiro SHIMIZU <anatofuz@cr.ie.u-ryukyu.ac.jp> |
---|---|
date | Tue, 30 Oct 2018 18:40:24 +0900 |
parents | 2cf249471370 |
children |
line wrap: on
line source
#include "moar.h" #include <math.h> /* We put a ' ' into the current code point buffer when we reach the end of the string, * as it's something that can be harmlessly added to the end of a number */ #define END_OF_NUM ' ' static int is_whitespace(MVMThreadContext *tc, MVMCodepoint cp) { if (cp <= '~') { if (cp == ' ' || (cp <= 13 && cp >= 9)) return 1; else return 0; } return MVM_unicode_codepoint_has_property_value(tc, cp, MVM_UNICODE_PROPERTY_WHITE_SPACE, 1); } static int cp_value(MVMThreadContext *tc, MVMCodepoint cp) { if (cp >= '0' && cp <= '9') return cp - '0'; /* fast-path for ASCII 0..9 */ else if (cp >= 'a' && cp <= 'z') return cp - 'a' + 10; else if (cp >= 'A' && cp <= 'Z') return cp - 'A' + 10; else if (cp >= 0xFF21 && cp <= 0xFF3A) return cp - 0xFF21 + 10; /* uppercase fullwidth */ else if (cp >= 0xFF41 && cp <= 0xFF5A) return cp - 0xFF41 + 10; /* lowercase fullwidth */ else if (cp > 0 && MVM_unicode_codepoint_get_property_int(tc, cp, MVM_UNICODE_PROPERTY_NUMERIC_TYPE) == MVM_UNICODE_PVALUE_Numeric_Type_DECIMAL) { /* as of Unicode 9.0.0, characters with the 'de' Numeric Type (and are * thus also of General Category Nd, since 4.0.0) are contiguous * sequences of 10 chars whose Numeric Values ascend from 0 through 9. */ /* the string returned for NUMERIC_VALUE_NUMERATOR contains an integer * value. We can use numerator because they all are from 0-9 and have * denominator of 1 */ return fast_atoi(MVM_unicode_codepoint_get_property_cstr(tc, cp, MVM_UNICODE_PROPERTY_NUMERIC_VALUE_NUMERATOR)); } return -1; } int static get_cp(MVMThreadContext *tc, MVMCodepointIter *ci, MVMCodepoint *cp) { if (!MVM_string_ci_has_more(tc, ci)) { *cp = END_OF_NUM; // FIXME pick a safe value return 1; } else { *cp = MVM_string_ci_get_codepoint(tc, ci); return 0; } } static void parse_error(MVMThreadContext *tc, MVMString *s, const char* reason) { char* got = MVM_string_utf8_c8_encode_C_string(tc, s); char *waste[] = { got, NULL }; MVM_exception_throw_adhoc_free(tc, waste, "Can't convert '%s' to num: %s", got, reason); } static void skip_whitespace(MVMThreadContext *tc, MVMCodepointIter *ci, MVMCodepoint *cp) { while (is_whitespace(tc, *cp)) { if (get_cp(tc, ci, cp)) return; } } static int parse_sign(MVMThreadContext *tc, MVMCodepointIter *ci, MVMCodepoint *cp) { // Handle any leading +/-/− sign int has_minus = (*cp == '-' || *cp == 8722); // '-', '−' if (has_minus || *cp == '+') { // '-', '−', '+' get_cp(tc, ci, cp); } return (has_minus ? -1 : 1); } static double parse_decimal_integer(MVMThreadContext *tc, MVMCodepointIter *ci, MVMCodepoint *cp, MVMString* s) { int ends_with_underscore = 0; double value = 0; int digit; if (*cp == '_') parse_error(tc, s, "number can't be start with _"); while (*cp == '_' || (digit = cp_value(tc, *cp)) != -1) { ends_with_underscore = *cp == '_'; if (*cp != '_') { if (digit >= 10) parse_error(tc, s, "expecting comma seprated decimal numbers after :$radix[]"); value = value * 10 + digit; } get_cp(tc, ci, cp); } if (ends_with_underscore) parse_error(tc, s, "a number can't end in underscore"); return value; } static double parse_int_frac_exp(MVMThreadContext *tc, MVMCodepointIter *ci, MVMCodepoint *cp, MVMString* s, double radix, int leading_zero) { /* * What we do here is extract the digits from the original string, * effectively stripping off underscores and converting fancy Unicode * digits to regular ones. We then ASCII-fy those digits and stuff * them into digits_buf (along with double-ish things like the dot * and 'e'). At the end we give the resultant string to strtod() to * do all the dirty work for us, so we don't have to worry about * handling denormals or picking closest representable double */ int digits = 0; int frac_digits = 0; int digit; int ends_with_underscore = 0; char *digits_buf = (char *)MVM_malloc(1 + MVM_string_graphs(tc, s)); char *digits_buf_tail = digits_buf; double result; if (*cp == '_') parse_error(tc, s, "number can't start with _"); if (*cp != '.') { while (*cp == '_' || (digit = cp_value(tc, *cp)) != -1) { ends_with_underscore = *cp == '_'; if (*cp != '_') { if (digit >= radix) break; *digits_buf_tail++ = '0' + digit; digits++; } get_cp(tc, ci, cp); } if (ends_with_underscore) parse_error(tc, s, "a number can't end in underscore"); } if (*cp == '.') { *digits_buf_tail++ = '.'; get_cp(tc, ci, cp); if (*cp == '_') parse_error(tc, s, "radix point can't be followed by _"); while (*cp == '_' || (digit = cp_value(tc, *cp)) != -1) { ends_with_underscore = *cp == '_'; if (*cp != '_') { if (digit >= radix) break; *digits_buf_tail++ = '0' + digit; frac_digits++; } get_cp(tc, ci, cp); } if (frac_digits == 0) parse_error(tc, s, "radix point must be followed by one or more valid digits"); if (ends_with_underscore) parse_error(tc, s, "a number can't end in underscore"); } if (digits == 0 && frac_digits == 0 && !leading_zero) parse_error(tc, s, "expecting a number"); if (*cp == 'E' || *cp == 'e') { int e_digits = 0; *digits_buf_tail++ = 'e'; get_cp(tc, ci, cp); if (parse_sign(tc, ci, cp) == -1) *digits_buf_tail++ = '-'; if (*cp == '_') parse_error(tc, s, "'e' or 'E' can't be followed by _"); while (*cp == '_' || (digit = cp_value(tc, *cp)) != -1) { if (*cp != '_') { if (digit >= radix) break; *digits_buf_tail++ = '0' + digit; e_digits++; } get_cp(tc, ci, cp); } if (e_digits == 0) parse_error(tc, s, "'e' or 'E' must be followed by one or more valid digits"); } *digits_buf_tail = '\0'; result = strtod(digits_buf, NULL); MVM_free(digits_buf); return result; } static int match_word(MVMThreadContext *tc, MVMCodepointIter *ci, MVMCodepoint *cp, char word[3], MVMString *s) { if (*cp == word[0]) { get_cp(tc, ci, cp); if (*cp == word[1]) { get_cp(tc, ci, cp); if (*cp == word[2]) { get_cp(tc, ci, cp); return 1; } else { parse_error(tc, s, "that's not a number"); } } else { parse_error(tc, s, "that's not a number"); } } return 0; } static double parse_simple_number(MVMThreadContext *tc, MVMCodepointIter *ci, MVMCodepoint *cp, MVMString *s) { double sign; // Handle NaN here, to make later parsing simpler if (match_word(tc, ci, cp, "NaN", s)) { return MVM_num_nan(tc); } sign = parse_sign(tc, ci, cp); if (match_word(tc, ci, cp, "Inf", s)) { return sign * MVM_num_posinf(tc); } else if (*cp == ':') { int radix; double body; get_cp(tc, ci, cp); radix = (int) parse_int_frac_exp(tc, ci, cp, s, 10, 0); if (*cp == '<') { get_cp(tc, ci, cp); body = parse_int_frac_exp(tc, ci, cp, s, radix, 0); if (*cp == '>') { get_cp(tc, ci, cp); return sign * body; } else { parse_error(tc, s, "malformed ':radix<>' style radix number, expecting '>' after the body"); } } else if (*cp == 171) { // « get_cp(tc, ci, cp); body = parse_int_frac_exp(tc, ci, cp, s, radix, 0); if (*cp == 187) { // » get_cp(tc, ci, cp); return sign * body; } else { parse_error(tc, s, "malformed ':radix«»' style radix number, expecting '>' after the body"); } } else if (*cp == '[') { // « double result = 0; get_cp(tc, ci, cp); while (*cp != ']' && MVM_string_ci_has_more(tc, ci)) { double digit = parse_decimal_integer(tc, ci, cp, s); result = result * radix + digit; if (*cp == ',') { get_cp(tc, ci, cp); } } if (*cp == ']') { // » get_cp(tc, ci, cp); return sign * result; } else { parse_error(tc, s, "malformed ':radix[]' style radix number, expecting ']' after the body"); } } } else if (*cp == '0') { int radix = 0; get_cp(tc, ci, cp); if (*cp == 'b') radix = 2; else if (*cp == 'o') radix = 8; else if (*cp == 'd') radix = 10; else if (*cp == 'x') radix = 16; if (radix) { get_cp(tc, ci, cp); if (*cp == '_') get_cp(tc, ci, cp); return sign * parse_int_frac_exp(tc, ci, cp, s, radix, 1); } else { return sign * parse_int_frac_exp(tc, ci, cp, s, 10, 1); } } else { return sign * parse_int_frac_exp(tc, ci, cp, s, 10, 0); } } static double parse_real(MVMThreadContext *tc, MVMCodepointIter *ci, MVMCodepoint *cp, MVMString *s) { double result = parse_simple_number(tc, ci, cp, s); double denom; // Check for '/' indicating Rat denominator if (*cp == '/') { get_cp(tc, ci, cp); denom = parse_simple_number(tc, ci, cp, s); result = result / denom; } return result; } MVMnum64 MVM_coerce_s_n(MVMThreadContext *tc, MVMString *s) { MVMCodepointIter ci; MVMCodepoint cp; MVMnum64 n = 123; MVM_string_ci_init(tc, &ci, s, 0, 0); if (get_cp(tc, &ci, &cp)) return 0; skip_whitespace(tc, &ci, &cp); // Do we have only whitespace if (!MVM_string_ci_has_more(tc, &ci) && cp == END_OF_NUM) { return 0; } n = parse_real(tc, &ci, &cp, s); skip_whitespace(tc, &ci, &cp); if (MVM_string_ci_has_more(tc, &ci) || cp != END_OF_NUM) { parse_error(tc, s, "trailing characters"); } return n; }