358 lines
7.4 KiB
C++
358 lines
7.4 KiB
C++
// go-encode-id.cc -- Go identifier and packagepath encoding/decoding hooks
|
|
|
|
// Copyright 2016 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
#include "go-system.h"
|
|
|
|
#include "gogo.h"
|
|
#include "go-location.h"
|
|
#include "go-linemap.h"
|
|
#include "go-encode-id.h"
|
|
#include "lex.h"
|
|
|
|
// Return true if the character C has to be escaped when it appears in
// a name that we are encoding, false if it may be emitted unchanged.
// Only the ASCII letters and the ASCII digits are emitted unchanged.

static bool
char_needs_encoding(char c)
{
  const bool is_upper = c >= 'A' && c <= 'Z';
  const bool is_lower = c >= 'a' && c <= 'z';
  const bool is_digit = c >= '0' && c <= '9';
  return !(is_upper || is_lower || is_digit);
}
|
|
|
|
// Return whether the identifier needs to be translated because it
|
|
// contains non-ASCII characters.
|
|
|
|
bool
|
|
go_id_needs_encoding(const std::string& str)
|
|
{
|
|
for (std::string::const_iterator p = str.begin();
|
|
p != str.end();
|
|
++p)
|
|
if (char_needs_encoding(*p))
|
|
return true;
|
|
return false;
|
|
}
|
|
|
|
// Lookup table from ASCII characters to the single character that
// follows '_' in their underscore encoding.

class Special_char_code
{
 public:
  Special_char_code();

  // Return the character that follows '_' in the simple underscore
  // encoding of C.  Returns 0 if C has no simple encoding, which is
  // always the case for values above 127.
  char
  code_for(unsigned int c) const
  {
    if (c > 127)
      return 0;
    return this->codes_[c];
  }

 private:
  // Table indexed by ASCII value; a zero entry means "no simple code".
  char codes_[128];
};

// Build the table: every entry starts out as 0, then the characters
// that have a simple code are filled in.

Special_char_code::Special_char_code()
{
  // Parallel strings: the character plain[i] is encoded as '_'
  // followed by coded[i].
  const char plain[] = "_./*,{}[]()\" ;";
  const char coded[] = "_0123456789abc";

  memset(this->codes_, 0, sizeof this->codes_);
  for (size_t i = 0; plain[i] != '\0'; ++i)
    this->codes_[static_cast<unsigned char>(plain[i])] = coded[i];
}

// The one table instance used by the encoder in this file.

static const Special_char_code special_char_code;
|
|
|
|
// Pull the next UTF-8 character out of P and store it in *PC.  Return
// the number of bytes read, which is always at least 1.
//
// P must point into a NUL-terminated buffer.  A well-formed sequence
// of one to four bytes is decoded to its code point.  If the bytes at
// P do not start a well-formed sequence (a stray continuation byte, a
// lead byte that cannot start a sequence of at most four bytes, or a
// lead byte that is not followed by enough continuation bytes), then
// exactly one byte is consumed and its raw value (0x80 through 0xff)
// is stored in *PC.  Every continuation byte is checked before the
// next one is read, and NUL is not a continuation byte, so this never
// reads past the terminating NUL.  No check is made for overlong
// encodings or for values outside the Unicode range.

static size_t
fetch_utf8_char(const char* p, unsigned int* pc)
{
  const unsigned char lead = static_cast<unsigned char>(*p);

  // Plain ASCII.
  if ((lead & 0x80) == 0)
    {
      *pc = lead;
      return 1;
    }

  // Work out the sequence length and the payload bits of the lead
  // byte from the pattern of its high bits.
  size_t len;
  unsigned int rc;
  if ((lead & 0xe0) == 0xc0)
    {
      len = 2;
      rc = lead & 0x1f;
    }
  else if ((lead & 0xf0) == 0xe0)
    {
      len = 3;
      rc = lead & 0x0f;
    }
  else if ((lead & 0xf8) == 0xf0)
    {
      len = 4;
      rc = lead & 0x07;
    }
  else
    {
      // Stray continuation byte or a byte that cannot start a
      // sequence: hand it back unchanged as a single byte.
      *pc = lead;
      return 1;
    }

  for (size_t i = 1; i < len; i++)
    {
      const unsigned char cont = static_cast<unsigned char>(p[i]);
      if ((cont & 0xc0) != 0x80)
        {
          // Truncated or malformed sequence: consume only the lead
          // byte so the caller never steps past the end of the
          // buffer.
          *pc = lead;
          return 1;
        }
      rc = (rc << 6) | (cont & 0x3f);
    }

  *pc = rc;
  return len;
}
|
|
|
|
// Encode an identifier using assembler-friendly characters. The
|
|
// encoding is described in detail near the end of the long comment at
|
|
// the start of names.cc.
|
|
|
|
std::string
|
|
go_encode_id(const std::string &id)
|
|
{
|
|
if (Lex::is_invalid_identifier(id))
|
|
{
|
|
go_assert(saw_errors());
|
|
return id;
|
|
}
|
|
|
|
std::string ret;
|
|
const char* p = id.c_str();
|
|
const char* pend = p + id.length();
|
|
|
|
// We encode a leading digit, to ensure that no identifier starts
|
|
// with a digit.
|
|
if (pend > p && p[0] >= '0' && p[0] <= '9')
|
|
{
|
|
char buf[8];
|
|
snprintf(buf, sizeof buf, "_x%02x", p[0]);
|
|
ret.append(buf);
|
|
++p;
|
|
}
|
|
|
|
while (p < pend)
|
|
{
|
|
unsigned int c;
|
|
size_t len = fetch_utf8_char(p, &c);
|
|
if (len == 1)
|
|
{
|
|
if (!char_needs_encoding(c))
|
|
ret.push_back(c);
|
|
else
|
|
{
|
|
char code = special_char_code.code_for(c);
|
|
if (code != 0)
|
|
{
|
|
ret.push_back('_');
|
|
ret.push_back(code);
|
|
}
|
|
else
|
|
{
|
|
char buf[8];
|
|
snprintf(buf, sizeof buf, "_x%02x", c);
|
|
ret.append(buf);
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
char buf[16];
|
|
if (c < 0x10000)
|
|
snprintf(buf, sizeof buf, "_u%04x", c);
|
|
else
|
|
snprintf(buf, sizeof buf, "_U%08x", c);
|
|
ret.append(buf);
|
|
}
|
|
|
|
p += len;
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
// Convert a hex digit string to a unicode codepoint. No checking
|
|
// to insure that the hex digit is meaningful.
|
|
|
|
static unsigned
|
|
hex_digits_to_unicode_codepoint(const char *digits, unsigned ndig)
|
|
{
|
|
unsigned result = 0;
|
|
for (unsigned i = 0; i < ndig; ++i) {
|
|
result <<= 4;
|
|
result |= Lex::hex_val(digits[i]);
|
|
}
|
|
return result;
|
|
}
|
|
|
|
// Decode/demangle a mangled string produced by go_encode_id(). Returns
|
|
// empty string if demangling process fails in some way. At the moment
|
|
// this routine is unused; there is an equivalent routine in the runtime
|
|
// used for demangling symbols appearing in stack traces.
|
|
|
|
std::string
|
|
go_decode_id(const std::string &encoded)
|
|
{
|
|
std::string ret;
|
|
const char* p = encoded.c_str();
|
|
const char* pend = p + encoded.length();
|
|
const Location loc = Linemap::predeclared_location();
|
|
|
|
while (p < pend)
|
|
{
|
|
if (*p != '_' || p + 1 == pend)
|
|
{
|
|
ret.push_back(*p);
|
|
p++;
|
|
continue;
|
|
}
|
|
|
|
switch (p[1])
|
|
{
|
|
case '_':
|
|
ret.push_back('_');
|
|
p += 2;
|
|
break;
|
|
case '0':
|
|
ret.push_back('.');
|
|
p += 2;
|
|
break;
|
|
case '1':
|
|
ret.push_back('/');
|
|
p += 2;
|
|
break;
|
|
case '2':
|
|
ret.push_back('*');
|
|
p += 2;
|
|
break;
|
|
case '3':
|
|
ret.push_back(',');
|
|
p += 2;
|
|
break;
|
|
case '4':
|
|
ret.push_back('{');
|
|
p += 2;
|
|
break;
|
|
case '5':
|
|
ret.push_back('}');
|
|
p += 2;
|
|
break;
|
|
case '6':
|
|
ret.push_back('[');
|
|
p += 2;
|
|
break;
|
|
case '7':
|
|
ret.push_back(']');
|
|
p += 2;
|
|
break;
|
|
case '8':
|
|
ret.push_back('(');
|
|
p += 2;
|
|
break;
|
|
case '9':
|
|
ret.push_back(')');
|
|
p += 2;
|
|
break;
|
|
case 'a':
|
|
ret.push_back('"');
|
|
p += 2;
|
|
break;
|
|
case 'b':
|
|
ret.push_back(' ');
|
|
p += 2;
|
|
break;
|
|
case 'c':
|
|
ret.push_back(';');
|
|
p += 2;
|
|
break;
|
|
case 'x':
|
|
{
|
|
const char* digits = p + 2;
|
|
if (strlen(digits) < 2)
|
|
return "";
|
|
unsigned int rune = hex_digits_to_unicode_codepoint(digits, 2);
|
|
Lex::append_char(rune, true, &ret, loc);
|
|
p += 4;
|
|
}
|
|
break;
|
|
case 'u':
|
|
{
|
|
const char* digits = p + 2;
|
|
if (strlen(digits) < 4)
|
|
return "";
|
|
unsigned int rune = hex_digits_to_unicode_codepoint(digits, 4);
|
|
Lex::append_char(rune, true, &ret, loc);
|
|
p += 6;
|
|
}
|
|
break;
|
|
case 'U':
|
|
{
|
|
const char* digits = p + 2;
|
|
if (strlen(digits) < 8)
|
|
return "";
|
|
unsigned int rune = hex_digits_to_unicode_codepoint(digits, 8);
|
|
Lex::append_char(rune, true, &ret, loc);
|
|
p += 10;
|
|
}
|
|
break;
|
|
default:
|
|
return "";
|
|
}
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
// Encode a struct field tag.  This is only used when we need to
// create a type descriptor for an anonymous struct type with field
// tags.  Underscore encoding will be applied to the returned string.
// The tag will appear between curly braces, so that is all we have to
// avoid: each '{', '}' and '\' is preceded by a backslash, and every
// other byte is copied unchanged.
//
// The work is done byte by byte.  The three characters that are
// escaped are ASCII, and every byte of a multi-byte UTF-8 sequence has
// its high bit set, so a valid UTF-8 tag gives the same result as
// decoding it character by character.  A tag that is not valid UTF-8
// is passed through byte for byte, and nothing is ever read beyond the
// end of TAG.

std::string
go_mangle_struct_tag(const std::string& tag)
{
  std::string ret;
  ret.reserve(tag.length());
  for (std::string::size_type i = 0; i < tag.length(); ++i)
    {
      const char c = tag[i];
      if (c == '{' || c == '}' || c == '\\')
        ret.push_back('\\');
      ret.push_back(c);
    }
  return ret;
}
|