Skip to content

Commit

Permalink
Look for the utf8 sentinel in a separate pass
Browse files Browse the repository at this point in the history
This means that the utf8 sentinel doesn't have to be the first parameter to affect
the parsing of the entire query string.

ljharb#268 (comment)
  • Loading branch information
papandreou authored and ljharb committed Jul 26, 2018
1 parent 4bfb4d1 commit 16c43b3
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 23 deletions.
58 changes: 35 additions & 23 deletions lib/parse.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,15 @@ var defaults = {
allowDots: false,
allowPrototypes: false,
arrayLimit: 20,
charset: 'utf-8',
decoder: utils.decode,
delimiter: '&',
depth: 5,
interpretNumericEntities: false,
parameterLimit: 1000,
plainObjects: false,
strictNullHandling: false
strictNullHandling: false,
utf8Sentinel: false
};

var interpretNumericEntities = function (str) {
Expand All @@ -25,24 +28,41 @@ var interpretNumericEntities = function (str) {
// This is what browsers will submit when the ✓ character occurs in an
// application/x-www-form-urlencoded body and the encoding of the page containing
// the form is iso-8859-1, or when the submitted form has an accept-charset
// attribute of iso-8859-1. Presumably also with other charsets that does no contain
// attribute of iso-8859-1. Presumably also with other charsets that do not contain
// the ✓ character, such as us-ascii.
// HTML numeric character reference for the checkmark U+2713 (decimal 10003).
// This must stay the eight-character entity text, not the literal checkmark:
// it is compared against the decoded value of the utf8 parameter to detect
// an iso-8859-1 submission.
var numericCheckmark = '&#10003;';
// 'utf8=' followed by the percent-encoded numeric character reference for the
// checkmark U+2713, i.e. encodeURIComponent('&#10003;'):
// %26 is '&', %23 is '#', 10003 is the decimal code point, %3B is ';'.
var isoSentinel = 'utf8=%26%2310003%3B';

// These are the raw utf-8 bytes of the checkmark as code points in a string.
// It's what we end up with when the utf-8 sentinel parameter is interpreted
// as iso-8859-1. When utf8Sentinel is enabled, we will use it to course-correct
// and interpret the rest of the query string as utf-8.
var misinterpretedCheckmark = '\xe2\x9c\x93';
// These are the percent-encoded utf-8 octets representing a checkmark, indicating
// that the request actually is utf-8 encoded.
var utf8Sentinel = 'utf8=%E2%9C%93'; // encodeURIComponent('✓')

var parseValues = function parseQueryStringValues(str, options) {
var obj = {};
var cleanStr = options.ignoreQueryPrefix ? str.replace(/^\?/, '') : str;
var limit = options.parameterLimit === Infinity ? undefined : options.parameterLimit;
var parts = cleanStr.split(options.delimiter, limit);
var charset = options.charset;
var skipIndex = -1; // Keep track of where the utf8 sentinel was found
var i;

if (options.utf8Sentinel) {
for (i = 0; i < parts.length; ++i) {
if (parts[i].indexOf('utf8=') === 0) {
if (parts[i] === utf8Sentinel) {
charset = 'utf-8';
} else if (parts[i] === isoSentinel) {
charset = 'iso-8859-1';
}
skipIndex = i;
i = parts.length; // The eslint settings do not allow break;
}
}
}

for (var i = 0; i < parts.length; ++i) {
for (i = 0; i < parts.length; ++i) {
if (i === skipIndex) {
continue;
}
var part = parts[i];

var bracketEqualsPos = part.indexOf(']=');
Expand All @@ -57,21 +77,13 @@ var parseValues = function parseQueryStringValues(str, options) {
val = options.decoder(part.slice(pos + 1), defaults.decoder, charset);
}

if (key === 'utf8' && options.utf8Sentinel) {
if (val === '✓' || val === misinterpretedCheckmark) {
charset = 'utf-8';
} else if (val === numericCheckmark) {
charset = 'iso-8859-1';
}
if (options.interpretNumericEntities && charset === 'iso-8859-1') {
val = interpretNumericEntities(val);
}
if (has.call(obj, key)) {
obj[key] = [].concat(obj[key]).concat(val);
} else {
if (options.interpretNumericEntities && charset === 'iso-8859-1') {
val = interpretNumericEntities(val);
}
if (has.call(obj, key)) {
obj[key] = [].concat(obj[key]).concat(val);
} else {
obj[key] = val;
}
obj[key] = val;
}
}

Expand Down
5 changes: 5 additions & 0 deletions test/parse.js
Original file line number Diff line number Diff line change
Expand Up @@ -597,6 +597,11 @@ test('parse()', function (t) {
st.end();
});

t.test('does not require the utf8 sentinel to be defined before the parameters whose decoding it affects', function (st) {
    // The sentinel comes last here, yet it must still govern how `a` is decoded.
    // Assuming urlEncodedNumCheckmark is the percent-encoded numeric-entity
    // (iso-8859-1) sentinel, it overrides the charset: 'utf-8' option, so the two
    // utf-8 octets of the o-slash in `a` are each decoded as a separate
    // iso-8859-1 character (0xC3 and 0xB8) rather than as one 'ø'.
    st.deepEqual(qs.parse('a=' + urlEncodedOSlashInUtf8 + '&utf8=' + urlEncodedNumCheckmark, { utf8Sentinel: true, charset: 'utf-8' }), { a: 'Ã¸' });
    st.end();
});

t.test('should ignore an utf8 sentinel with an unknown value', function (st) {
st.deepEqual(qs.parse('utf8=foo&' + urlEncodedOSlashInUtf8 + '=' + urlEncodedOSlashInUtf8, { utf8Sentinel: true, charset: 'utf-8' }), { ø: 'ø' });
st.end();
Expand Down

0 comments on commit 16c43b3

Please sign in to comment.