Skip to content

Translation

wmt20_en_ja

This dataset was created as a test set for the WMT20 shared task on news translation. This preset evaluates English-to-Japanese translation.

References:

  • Data Source
  • 2020 Fifth Conference on Machine Translation (WMT20)
    // Evaluation data: the 'wmt20' test set for the 'en-ja' language pair.
    local wmt20_en_ja = {
      class_path: 'SacreBleuDataset',
      init_args: {
        name: 'wmt20',
        langpair: 'en-ja',
      },
    };

    // Few-shot examples are drawn from the evaluation data itself;
    // `RandomFewShotGenerator` will avoid using the same few-shot instances as the input.
    local few_shot = {
      class_path: 'RandomFewShotGenerator',
      init_args: {
        dataset: wmt20_en_ja,
        num_shots: 4,
      },
    };

    // Prompt: each few-shot pair is rendered as backtick-quoted En/Ja lines,
    // followed by the input source. The prompt ends with an opening backtick
    // after 'Ja: ', and '`' is also the stop sequence in `gen_kwargs` below.
    local prompt = |||
      {% for item in few_shot_data %}
      En: `{{ item.source }}`
      Ja: `{{ item.references[0] }}`
      {% endfor %}
      En: `{{ source }}`
    ||| + 'Ja: `';

    {
      class_path: 'Generation',
      init_args: {
        eval_dataset: wmt20_en_ja,
        few_shot_generator: few_shot,
        prompt_template: prompt,
        // BLEU is configured with the 'ja-mecab' tokenize option for the Japanese output.
        metrics: [
          {
            class_path: 'BLEU',
            init_args: { tokenize_option: 'ja-mecab' },
          },
        ],
        gen_kwargs: {
          max_new_tokens: 128,
          stop_sequences: ['`'],
        },
        batch_size: 4,
      },
    }
    

wmt20_ja_en

This dataset was created as a test set for the WMT20 shared task on news translation. This preset evaluates Japanese-to-English translation.

References:

  • Data Source
  • 2020 Fifth Conference on Machine Translation (WMT20)
    // Evaluation data: the 'wmt20' test set for the 'ja-en' language pair.
    local wmt20_ja_en = {
      class_path: 'SacreBleuDataset',
      init_args: {
        name: 'wmt20',
        langpair: 'ja-en',
      },
    };

    // Few-shot examples are drawn from the evaluation data itself;
    // `RandomFewShotGenerator` will avoid using the same few-shot instances as the input.
    local few_shot = {
      class_path: 'RandomFewShotGenerator',
      init_args: {
        dataset: wmt20_ja_en,
        num_shots: 4,
      },
    };

    // Prompt: each few-shot pair is rendered as backtick-quoted Ja/En lines,
    // followed by the input source. The prompt ends with an opening backtick
    // after 'En: ', and '`' is also the stop sequence in `gen_kwargs` below.
    local prompt = |||
      {% for item in few_shot_data %}
      Ja: `{{ item.source }}`
      En: `{{ item.references[0] }}`
      {% endfor %}
      Ja: `{{ source }}`
    ||| + 'En: `';

    {
      class_path: 'Generation',
      init_args: {
        eval_dataset: wmt20_ja_en,
        few_shot_generator: few_shot,
        prompt_template: prompt,
        // BLEU is configured with the 'intl' tokenize option for the English output.
        metrics: [
          {
            class_path: 'BLEU',
            init_args: { tokenize_option: 'intl' },
          },
        ],
        gen_kwargs: {
          max_new_tokens: 128,
          stop_sequences: ['`'],
        },
        batch_size: 4,
      },
    }