From 16c43b3688be6e74cebcaabeda24c5eef63f7402 Mon Sep 17 00:00:00 2001 From: Andreas Lind Date: Thu, 26 Jul 2018 19:00:59 +0200 Subject: [PATCH] Look for the utf8 sentinel in a separate pass Means that it doesn't have to be the first parameter to affect the parsing of the entire query string. https://github.com/ljharb/qs/pull/268#issuecomment-407927573 --- lib/parse.js | 58 +++++++++++++++++++++++++++++++-------------------- test/parse.js | 5 +++++ 2 files changed, 40 insertions(+), 23 deletions(-) diff --git a/lib/parse.js b/lib/parse.js index 5ee22431..185a0429 100644 --- a/lib/parse.js +++ b/lib/parse.js @@ -8,12 +8,15 @@ var defaults = { allowDots: false, allowPrototypes: false, arrayLimit: 20, + charset: 'utf-8', decoder: utils.decode, delimiter: '&', depth: 5, + interpretNumericEntities: false, parameterLimit: 1000, plainObjects: false, - strictNullHandling: false + strictNullHandling: false, + utf8Sentinel: false }; var interpretNumericEntities = function (str) { @@ -25,15 +28,13 @@ var interpretNumericEntities = function (str) { // This is what browsers will submit when the ✓ character occurs in an // application/x-www-form-urlencoded body and the encoding of the page containing // the form is iso-8859-1, or when the submitted form has an accept-charset -// attribute of iso-8859-1. Presumably also with other charsets that does no contain +// attribute of iso-8859-1. Presumably also with other charsets that do not contain // the ✓ character, such as us-ascii. -var numericCheckmark = '✓'; +var isoSentinel = 'utf8=%26%2310003%3B'; // encodeURIComponent('&#10003;') -// These are the raw utf-8 bytes of the checkmark as code points in a string. -// It's what we end up with when the utf-8 sentinel parameter is interpreted -// as iso-8859-1. When utf8Sentinel is enabled, we will use it to course-correct -// and interpret the rest of the query string as utf-8. 
-var misinterpretedCheckmark = '\xe2\x9c\x93'; +// These are the percent-encoded utf-8 octets representing a checkmark, indicating +// that the request actually is utf-8 encoded. +var utf8Sentinel = 'utf8=%E2%9C%93'; // encodeURIComponent('✓') var parseValues = function parseQueryStringValues(str, options) { var obj = {}; @@ -41,8 +42,27 @@ var parseValues = function parseQueryStringValues(str, options) { var limit = options.parameterLimit === Infinity ? undefined : options.parameterLimit; var parts = cleanStr.split(options.delimiter, limit); var charset = options.charset; + var skipIndex = -1; // Keep track of where the utf8 sentinel was found + var i; + + if (options.utf8Sentinel) { + for (i = 0; i < parts.length; ++i) { + if (parts[i].indexOf('utf8=') === 0) { + if (parts[i] === utf8Sentinel) { + charset = 'utf-8'; + } else if (parts[i] === isoSentinel) { + charset = 'iso-8859-1'; + } + skipIndex = i; + i = parts.length; // The eslint settings do not allow break; + } + } + } - for (var i = 0; i < parts.length; ++i) { + for (i = 0; i < parts.length; ++i) { + if (i === skipIndex) { + continue; + } var part = parts[i]; var bracketEqualsPos = part.indexOf(']='); @@ -57,21 +77,13 @@ var parseValues = function parseQueryStringValues(str, options) { val = options.decoder(part.slice(pos + 1), defaults.decoder, charset); } - if (key === 'utf8' && options.utf8Sentinel) { - if (val === '✓' || val === misinterpretedCheckmark) { - charset = 'utf-8'; - } else if (val === numericCheckmark) { - charset = 'iso-8859-1'; - } + if (options.interpretNumericEntities && charset === 'iso-8859-1') { + val = interpretNumericEntities(val); + } + if (has.call(obj, key)) { + obj[key] = [].concat(obj[key]).concat(val); } else { - if (options.interpretNumericEntities && charset === 'iso-8859-1') { - val = interpretNumericEntities(val); - } - if (has.call(obj, key)) { - obj[key] = [].concat(obj[key]).concat(val); - } else { - obj[key] = val; - } + obj[key] = val; } } diff --git a/test/parse.js 
b/test/parse.js index eff07e09..d5f43df1 100644 --- a/test/parse.js +++ b/test/parse.js @@ -597,6 +597,11 @@ test('parse()', function (t) { st.end(); }); + t.test('does not require the utf8 sentinel to be defined before the parameters whose decoding it affects', function (st) { + st.deepEqual(qs.parse('a=' + urlEncodedOSlashInUtf8 + '&utf8=' + urlEncodedNumCheckmark, { utf8Sentinel: true, charset: 'utf-8' }), { a: 'ø' }); + st.end(); + }); + t.test('should ignore an utf8 sentinel with an unknown value', function (st) { st.deepEqual(qs.parse('utf8=foo&' + urlEncodedOSlashInUtf8 + '=' + urlEncodedOSlashInUtf8, { utf8Sentinel: true, charset: 'utf-8' }), { ø: 'ø' }); st.end();