diff --git a/config/locales/en.yml b/config/locales/en.yml index 2e72475a..ce7cd99d 100644 --- a/config/locales/en.yml +++ b/config/locales/en.yml @@ -23,6 +23,7 @@ en: nostdin: Do not read from stdin out_format: 'Output format: %{valid_text}' pattern_router: 'Use pattern router: keys moved per config data.write' + skip_interpolation: Skip translating strings that are straight interpolations (eg. "%{comment}") strict: >- Avoid inferring dynamic key usages such as t("cats.#{cat}.name"). Takes precedence over the config setting if set. @@ -112,8 +113,11 @@ en: missing: details_title: Value in other locales or source none: No translations are missing. + plural: Missing plural keys openai_translate: errors: + invalid_json: invalid JSON returned from backend. + invalid_size: 'Invalid size returned, expected: %{expected}, received: %{actual}.' no_api_key: >- Set OpenAI API key via OPENAI_API_KEY environment variable or translation.openai_api_key in config/i18n-tasks.yml. Get the key at https://openai.com/. diff --git a/config/locales/ru.yml b/config/locales/ru.yml index c6e7e4b2..c5e02271 100644 --- a/config/locales/ru.yml +++ b/config/locales/ru.yml @@ -22,6 +22,8 @@ ru: nostdin: Не читать дерево из стандартного ввода out_format: 'Формат вывода: %{valid_text}.' pattern_router: 'Использовать pattern_router: ключи распределятся по файлам согласно data.write' + skip_interpolation: Пропустить перевод строк, которые являются прямыми интерполяциями (например, + "%{comment}") strict: Не угадывать динамические использования ключей, например `t("category.#{category.key}")` translation_backend: Движок перевода [google, deepl, yandex, openai] value: >- @@ -111,8 +113,11 @@ ru: missing: details_title: На других языках или в коде none: Всё переведено. + plural: Отсутствуют ключи множественного числа openai_translate: errors: + invalid_json: недействительный JSON, возвращенный из бэкэнда + invalid_size: 'Возвращен неверный размер, ожидалось: %{expected}, получено: %{actual}.' no_api_key: |- Установить ключ API Яндекса с помощью переменной среды OPENAI_API_KEY или translation.openai_api_key в config / i18n-tasks.yml. Получите ключ по адресу https://openai.com/. diff --git a/lib/i18n/tasks/base_task.rb b/lib/i18n/tasks/base_task.rb index 96a74f79..d1a37c64 100644 --- a/lib/i18n/tasks/base_task.rb +++ b/lib/i18n/tasks/base_task.rb @@ -39,6 +39,9 @@ class BaseTask include Data include Stats + INTERPOLATION_KEY_RE = /%\{[^}]+}/.freeze + INTERPOLATION_ONLY_KEY_RE = /^%\{[^}]+\}$/.freeze + def initialize(config_file: nil, **config) @config_override = config_file self.config = config || {} diff --git a/lib/i18n/tasks/command/commands/missing.rb b/lib/i18n/tasks/command/commands/missing.rb index 06883a83..71de7a28 100644 --- a/lib/i18n/tasks/command/commands/missing.rb +++ b/lib/i18n/tasks/command/commands/missing.rb @@ -40,10 +40,19 @@ def missing(opt = {}) cmd :translate_missing, pos: '[locale ...]', desc: t('i18n_tasks.cmd.desc.translate_missing'), - args: [:locales, :locale_to_translate_from, arg(:out_format).from(1), :translation_backend, :pattern] + args: [ + :locales, :locale_to_translate_from, arg(:out_format).from(1), + :translation_backend, :pattern, :skip_interpolation + ] def translate_missing(opt = {}) - missing = i18n.missing_diff_forest opt[:locales], opt[:from] + missing = i18n.missing_keys( + locales: opt[:locales], + types: %w[diff plural], + base_locale: opt[:from], + skip_interpolation: (opt[:skip_interpolation] != 'false') + ) + if opt[:pattern] pattern_re = i18n.compile_key_pattern(opt[:pattern]) missing.select_keys! { |full_key, _node| full_key =~ pattern_re } diff --git a/lib/i18n/tasks/command/options/common.rb b/lib/i18n/tasks/command/options/common.rb index 554805d5..588a29d5 100644 --- a/lib/i18n/tasks/command/options/common.rb +++ b/lib/i18n/tasks/command/options/common.rb @@ -33,6 +33,11 @@ module Common '--config FILE', t('i18n_tasks.cmd.args.desc.config') + arg :skip_interpolation, + '-s', + '--skip_interpolation', + t('i18n_tasks.cmd.args.desc.skip_interpolation') + def arg_or_pos!(key, opts) opts[key] ||= opts[:arguments].try(:shift) end diff --git a/lib/i18n/tasks/missing_keys.rb b/lib/i18n/tasks/missing_keys.rb index d0ddf962..3a5de01d 100644 --- a/lib/i18n/tasks/missing_keys.rb +++ b/lib/i18n/tasks/missing_keys.rb @@ -19,12 +19,12 @@ def missing_keys_types # @param types [:used, :diff, :plural] all if `nil`. # @return [Siblings] - def missing_keys(locales: nil, types: nil, base_locale: nil) + def missing_keys(locales: nil, types: nil, base_locale: nil, skip_interpolation: false) locales ||= self.locales types ||= missing_keys_types base = base_locale || self.base_locale types.inject(empty_forest) do |f, type| - f.merge! send(:"missing_#{type}_forest", locales, base) + f.merge! send(:"missing_#{type}_forest", locales, base, skip_interpolation) end end @@ -35,47 +35,31 @@ def eq_base_keys(opts = {}) end end - def missing_diff_forest(locales, base = base_locale) + def missing_diff_forest(locales, base = base_locale, skip_interpolation = false) tree = empty_forest + # present in base but not locale (locales - [base]).each do |locale| - tree.merge! missing_diff_tree(locale, base) + tree.merge! missing_diff_tree(locale, base, skip_interpolation) end if locales.include?(base) # present in locale but not base (self.locales - [base]).each do |locale| - tree.merge! missing_diff_tree(base, locale) + tree.merge! missing_diff_tree(base, locale, skip_interpolation) end end tree end - def missing_used_forest(locales, _base = base_locale) + def missing_used_forest(locales, _base = base_locale, _skip_interpolation = false) locales.inject(empty_forest) do |forest, locale| forest.merge! missing_used_tree(locale) end end - def missing_plural_forest(locales, _base = base_locale) - locales.each_with_object(empty_forest) do |locale, forest| - required_keys = required_plural_keys_for_locale(locale) - next if required_keys.empty? - - tree = empty_forest - plural_nodes data[locale] do |node| - children = node.children - present_keys = Set.new(children.map { |c| c.key.to_sym }) - next if ignore_key?(node.full_key(root: false), :missing) - next if present_keys.superset?(required_keys) - - tree[node.full_key] = node.derive( - value: children.to_hash, - children: nil, - data: node.data.merge(missing_keys: (required_keys - present_keys).to_a) - ) - end - tree.set_root_key!(locale, type: :missing_plural) - forest.merge!(tree) + def missing_plural_forest(locales, base = base_locale, _skip_interpolation = false) + locales.inject(empty_forest) do |forest, locale| + forest.merge! missing_plural_tree(locale, base) end end @@ -93,9 +77,14 @@ def load_rails_i18n_pluralization!(locale) end # keys present in compared_to, but not in locale - def missing_diff_tree(locale, compared_to = base_locale) - data[compared_to].select_keys do |key, _node| - locale_key_missing? locale, depluralize_key(key, compared_to) + def missing_diff_tree(locale, compared_to = base_locale, skip_interpolation = false) + data[compared_to].select_keys do |key, node| + # Skip if the string is just an interpolation, eg: "%{body}" + if skip_interpolation && node.value.to_s.match?(BaseTask::INTERPOLATION_ONLY_KEY_RE) + false + else + locale_key_missing? locale, depluralize_key(key, compared_to) + end end.set_root_key!(locale, type: :missing_diff).keys do |_key, node| # change path and locale to base data = { locale: locale, missing_diff_locale: node.data[:locale] } @@ -106,6 +95,44 @@ def missing_diff_tree(locale, compared_to = base_locale) end end + def missing_plural_tree(locale, base = base_locale, _skip_interpolation = false) # rubocop:disable Metrics/AbcSize + tree = empty_forest + + required_keys = required_plural_keys_for_locale(locale) + return tree if required_keys.empty? + + plural_nodes data[base] do |node| + children = node.children + + # Get the existing translated node if it exists + node_key = node.full_key(root: false) + translated_node = data[locale].get("#{locale}.#{node_key}") + present_keys = if translated_node + # If it's a single existing (non-plural) translation, skip it + next unless translated_node.value.nil? + + # Otherwise get the existing plural keys + Set.new(translated_node.children.map { |c| c.key.to_sym }) + else + Set.new + end + # Compare the keys to those existing in base + next if ignore_key?(node.full_key(root: false), :missing, locale) + next if present_keys.superset?(required_keys) + + # Mark for removal any existing keys that are not required + base_keys = Set.new(node.children.map { |c| c.key.to_sym }) + remove_keys = (present_keys + base_keys) - required_keys + + tree[node.full_key] = node.derive( + value: children.to_hash, + children: nil, + data: node.data.merge(missing_keys: (required_keys - present_keys).to_a, remove_keys: remove_keys) + ) + end + tree.set_root_key!(locale, type: :missing_plural) + end + # keys used in the code missing translations in locale def missing_used_tree(locale) used_tree(strict: true).select_keys do |key, _node| diff --git a/lib/i18n/tasks/reports/terminal.rb b/lib/i18n/tasks/reports/terminal.rb index 3cf18606..1e2a1d81 100644 --- a/lib/i18n/tasks/reports/terminal.rb +++ b/lib/i18n/tasks/reports/terminal.rb @@ -116,7 +116,8 @@ def missing_key_info(leaf) when :missing_used first_occurrence leaf when :missing_plural - leaf[:data][:missing_keys].join(', ') + "#{Rainbow(I18n.t('i18n_tasks.missing.plural')).cyan} " + + leaf[:data][:missing_keys].join(', ') else "#{Rainbow(leaf[:data][:missing_diff_locale]).cyan} " \ "#{format_value(leaf[:value].is_a?(String) ? leaf[:value].strip : leaf[:value])}" diff --git a/lib/i18n/tasks/translators/base_translator.rb b/lib/i18n/tasks/translators/base_translator.rb index 3fea19ed..306c2781 100644 --- a/lib/i18n/tasks/translators/base_translator.rb +++ b/lib/i18n/tasks/translators/base_translator.rb @@ -14,6 +14,8 @@ def initialize(i18n_tasks) # @return [I18n::Tasks::Tree::Siblings] translated forest def translate_forest(forest, from) forest.inject @i18n_tasks.empty_forest do |result, root| + merge_missing_plural_nodes!(root) + translated = translate_pairs(root.key_values(root: true), to: root.key, from: from) result.merge! Data::Tree::Siblings.from_flat_pairs(translated) end @@ -21,6 +23,37 @@ def translate_forest(forest, from) protected + # Recursively loop through the tree until all :missing_plural nodes have + # been added to the base + # @param [I18n::Tasks::Data::Tree::Node] node + # @return [void] + def merge_missing_plural_nodes!(node) # rubocop:disable Metrics/AbcSize + return unless node.children? + + node.children.each do |child| + if child.data[:type] == :missing_plural + child.data.delete(:type) + # Add missing plural keys + child.data.delete(:missing_keys).each do |missing_key| + # If the key is present use it, otherwise use the 'other' value + other_value = child.value[missing_key.to_s] || child.value['other'] + new_node = Data::Tree::Node.new( + key: missing_key, + value: other_value, + parent: child.parent, + data: { type: :missing_plural_key, locale: node.root.key } + ) + child.append!(new_node) + end + # Remove uneeded plural keys + remove_keys = child.data.delete(:remove_keys) || Set.new + child.select_nodes! { |n| !remove_keys.include?(n.key.to_sym) } + else + merge_missing_plural_nodes!(child) + end + end + end + # @param [Array<[String, Object]>] list of key-value pairs # @return [Array<[String, Object]>] translated list def translate_pairs(list, opts) @@ -51,7 +84,7 @@ def fetch_translations(list, opts) # @param [Array<[String, Object]>] list of key-value pairs # @return [Array] values for translation extracted from list def to_values(list, opts) - list.map { |l| dump_value(l[1], opts) }.flatten.compact + list.map { |l| dump_value(l[1], opts) }.flatten(1).compact end # @param [Array<[String, Object]>] list @@ -104,14 +137,13 @@ def parse_value(untranslated, each_translated, opts) end end - INTERPOLATION_KEY_RE = /%\{[^}]+}/.freeze UNTRANSLATABLE_STRING = 'X__' # @param [String] value # @return [String] 'hello, %{name}' => 'hello, ' def replace_interpolations(value) i = -1 - value.gsub INTERPOLATION_KEY_RE do + value.gsub BaseTask::INTERPOLATION_KEY_RE do i += 1 "#{UNTRANSLATABLE_STRING}#{i}" end @@ -121,9 +153,9 @@ def replace_interpolations(value) # @param [String] translated # @return [String] 'hello, ' => 'hello, %{name}' def restore_interpolations(untranslated, translated) - return translated if untranslated !~ INTERPOLATION_KEY_RE + return translated if untranslated !~ BaseTask::INTERPOLATION_KEY_RE - values = untranslated.scan(INTERPOLATION_KEY_RE) + values = untranslated.scan(BaseTask::INTERPOLATION_KEY_RE) translated.gsub(/#{Regexp.escape(UNTRANSLATABLE_STRING)}\d+/i) do |m| values[m[UNTRANSLATABLE_STRING.length..].to_i] end diff --git a/lib/i18n/tasks/translators/openai_translator.rb b/lib/i18n/tasks/translators/openai_translator.rb index 2d842464..a98f0ef0 100644 --- a/lib/i18n/tasks/translators/openai_translator.rb +++ b/lib/i18n/tasks/translators/openai_translator.rb @@ -6,7 +6,7 @@ module I18n::Tasks::Translators class OpenAiTranslator < BaseTranslator # max allowed texts per request - BATCH_SIZE = 50 + BATCH_SIZE = 25 DEFAULT_SYSTEM_PROMPT = <<~PROMPT.squish You are a professional translator that translates content from the %{from} locale to the %{to} locale in an i18n locale array. @@ -16,6 +16,13 @@ class OpenAiTranslator < BaseTranslator HTML markups (enclosed in < and > characters) must not be changed under any circumstance. Variables (starting with %%{ and ending with }) must not be changed under any circumstance. + Any provided keys must not be changed under any circumstance. + + If a "X__" prefixed interpolation is given for the "other" plural key, you should try and + use that in the translation for any missing pluralizations keys that may represent + multiples. + The goal is to conform to the Unicode CLDR plural rules while maintaining a understandable + translation. Keep in mind the context of all the strings for a more accurate translation. PROMPT @@ -75,19 +82,48 @@ def translate_values(list, from:, to:) results = [] list.each_slice(BATCH_SIZE) do |batch| - translations = translate(batch, from, to) - - results << JSON.parse(translations) + results << translate_batch(batch, from, to) end results.flatten end - def translate(values, from, to) + def translate_batch(batch, from, to, retry_count = 0) + translations = translate(batch, from, to) + result = JSON.parse(remove_json_markdown(translations)) + + if result.size != batch.size + if retry_count < RETRIES + translate_batch(batch, from, to, retry_count) + elsif retry_count == RETRIES + # Try each string individually + batch.each do |string| + translate_batch([string], from, to, RETRIES + 1) + end + else + error = I18n.t('i18n_tasks.openai_translate.errors.invalid_size', expected: batch.size, actual: result.size) + fail ::I18n::Tasks::CommandError, error + end + end + + result + rescue JSON::ParserError + if retry_count < RETRIES + translate_batch([string], from, to, retry_count + 1) + else + raise ::I18n::Tasks::CommandError, I18n.t('i18n_tasks.openai_translate.errors.invalid_json') + end + end + + def remove_json_markdown(string) + string.gsub(/^```json\s*(.*?)\s*```$/m, '\1').strip + end + + def translate(values, from, to) # rubocop:disable Metrics/MethodLength messages = [ { role: 'system', - content: format(system_prompt, from: from, to: to) + content: format(system_prompt, from: from, to: to, size: values.size) }, { role: 'user', @@ -103,7 +139,8 @@ def translate(values, from, to) parameters: { model: model, messages: messages, - temperature: 0.0 + temperature: 0.0, + stream: false } ) diff --git a/spec/missing_keys_spec.rb b/spec/missing_keys_spec.rb index f1b79686..c8ab47cc 100644 --- a/spec/missing_keys_spec.rb +++ b/spec/missing_keys_spec.rb @@ -71,4 +71,67 @@ def configuration_from(locale) end end end + + describe '#missing_diff_forest(locale)' do + let(:task) { I18n::Tasks::BaseTask.new } + let(:base_keys) do + { + regular_key: 'a', + other_key: 'b', + + plural_key: { + one: 'one hat', + other: '%{count} hats', + zero: '%{count}' + }, + + ignored_pattern: { + plural_key: { + other: '%{count}' + } + } + } + end + + around do |ex| + TestCodebase.setup( + 'config/i18n-tasks.yml' => { + base_locale: 'en', + locales: %w[en ru], + ignore_missing: ['ignored_pattern.*'] + }.to_yaml, + 'config/locales/en.yml' => { en: base_keys }.to_yaml, + 'config/locales/ru.yml' => { ru: { regular_key: 'я' } }.to_yaml + ) + TestCodebase.in_test_app_dir { ex.call } + TestCodebase.teardown + end + + it 'returns a hash of missing keys' do + expected = { + 'ru' => { + 'other_key' => 'b', + 'plural_key' => { + 'one' => 'one hat', + 'other' => '%{count} hats', + 'zero' => '%{count}' + } + } + } + expect(task.missing_diff_forest(['ru']).to_hash).to eq(expected) + end + + it 'skips values that are just interpolations' do + expected = { + 'ru' => { + 'other_key' => 'b', + 'plural_key' => { + 'one' => 'one hat', + 'other' => '%{count} hats' + } + } + } + expect(task.missing_diff_forest(['ru'], 'en', true).to_hash).to eq(expected) + end + end end diff --git a/spec/plural_keys_spec.rb b/spec/plural_keys_spec.rb index c1273611..b25bc9d0 100644 --- a/spec/plural_keys_spec.rb +++ b/spec/plural_keys_spec.rb @@ -78,5 +78,16 @@ def depluralize(key) expect(leaves[1].full_key).to eq 'ar.nested.plural_key' expect(leaves[1].data[:missing_keys]).to eq %i[two few many] end + + it 'ignores keys with a single interpolation string' do + tree = build_tree(ru: { plural_key: '%{count}' }) + task.data['ru'].merge!(tree) + wrong = task.missing_plural_forest(%w[en ru]) + leaves = wrong.leaves.to_a + + expect(leaves.size).to eq 1 + expect(leaves[0].full_key).to eq 'ru.nested.plural_key' + expect(leaves[0].data[:missing_keys]).to eq %i[one few many other] + end end end diff --git a/spec/translator_spec.rb b/spec/translator_spec.rb new file mode 100644 index 00000000..44a98ccb --- /dev/null +++ b/spec/translator_spec.rb @@ -0,0 +1,143 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe 'Translation' do + let(:test_translator) do + Class.new(I18n::Tasks::Translators::BaseTranslator) do + private + + def translate_values(list, from:, to:, **options) + list.map do |value| + case value + when Array + translate_values(value, from: from, to: to, **options) + when String + "translated:#{value}" + end + end + end + + def options_for_html + {} + end + + def options_for_plain + {} + end + + def options_for_translate_values(options) + options + end + end + end + let(:task) { I18n::Tasks::BaseTask.new } + let(:base_keys) do + { + regular_key: 'a', + + plural_key: { + one: 'one', + other: '%{count}', + zero: 'zero' + }, + + not_really_plural: { + one: 'a', + green: 'b' + }, + + nested: { + plural_key: { + zero: 'none', + one: 'one', + other: '%{count}' + } + }, + + ignored_pattern: { + plural_key: { + other: '%{count}' + } + } + } + end + + before do + allow(task).to receive(:translate_forest) { |forest, args| + test_translator.new(task).translate_forest(forest, args[:from]) + } + end + + around do |ex| + TestCodebase.setup( + 'config/i18n-tasks.yml' => { + base_locale: 'en', + locales: %w[en ru], + translation_backend: :test, + ignore_missing: ['ignored_pattern.*'] + }.to_yaml, + 'config/locales/en.yml' => { en: base_keys }.to_yaml, + 'config/locales/ru.yml' => { + ru: base_keys.except(:plural_key).deep_merge({ nested: { plural_key: { many: 'existing' } } }) + }.to_yaml + ) + TestCodebase.in_test_app_dir { ex.call } + TestCodebase.teardown + end + + describe '#missing' do + let(:ru_hash) do + { + 'ru' => { + 'plural_key' => { + 'one' => 'translated:one', + 'few' => 'translated:%{count}', + 'many' => 'translated:%{count}', + 'other' => 'translated:%{count}' + }, + 'nested' => { + 'plural_key' => { + 'few' => 'translated:%{count}' + } + } + } + } + end + + it 'translates missing plural keys and removed unrequired' do + missing = task.missing_plural_forest(['ru'], 'en') + result = task.translate_forest(missing, from: 'en', backend: :test) + + expect(result.to_hash).to eq(ru_hash) + end + end + + describe 'multi-line' do + let(:ru_hash) do + { + 'ru' => { + 'multi_line' => { + 'basic' => ['translated:line 1', 'translated:line 2'], + 'interpolated' => ['translated:line %{count}', 'translated:line %{count}'] + } + } + } + end + + it 'translates multi-line values as a single entity' do + task.data[:en] = build_tree('en' => + { + 'multi_line' => { + 'basic' => ['line 1', 'line 2'], + 'interpolated' => ['line %{count}', 'line %{count}'] + } + }) + + missing = task.missing_keys(locales: ['ru'], base_locale: 'en') + result = task.translate_forest(missing, from: 'en', backend: :test) + + expect(result.to_hash).to eq(ru_hash) + end + end +end