|
| 1 | +namespace :import do |
# Imports English glosses for lemmas.
#
# Reads lib/data/lemmas.txt, where each non-blank line is expected to be
# "<buckwalter key>\t<english translation>". The key is converted to Arabic
# script, stripped of diacritics, and looked up via Lemma#text_clean. The
# translation is appended to the lemma's en_translations unless it is already
# present. A summary and the list of unmatched keys are printed at the end.
#
# Raises whatever File.readlines raises when the data file is missing, and
# ActiveRecord errors from update! (the import stops at the first failure).
desc "Import English translations from lemmas.txt into Lemma.en_translations (jsonb array)"
task import_lemma_translations: :environment do
  file_path = Rails.root.join('lib', 'data', 'lemmas.txt')

  # Pair every line with its real 1-based position in the file *before*
  # dropping blank lines, so the "Line #" printed for unmatched entries can be
  # looked up directly in lemmas.txt.
  entries = File.readlines(file_path, chomp: true)
                .each.with_index(1)
                .reject { |line, _line_no| line.blank? }

  found_count = 0
  not_found_count = 0
  duplicate_count = 0
  malformed_count = 0
  unmatched_keys = []

  puts "\nStarting import of English translations...\n\n"

  entries.each do |line, line_no|
    buckwalter, eng = line.split("\t", 2).map(&:strip)

    # A usable entry needs both a key and a translation; count the rest so the
    # summary figures add up to the total.
    unless buckwalter.present? && eng.present?
      malformed_count += 1
      next
    end

    arabic = convert_buckwalter_to_arabic(buckwalter)
    clean = remove_diacritics(arabic)

    lemma = Lemma.find_by(text_clean: clean)
    if lemma.nil?
      not_found_count += 1
      unmatched_keys << { index: line_no, buckwalter: buckwalter, clean: clean }
      next
    end

    existing = lemma.en_translations || []
    if existing.include?(eng)
      duplicate_count += 1
    else
      # Assign a fresh array instead of mutating the loaded attribute in place.
      lemma.update!(en_translations: existing + [eng])
      found_count += 1
    end
  end

  puts "\n Import Summary:"
  puts " Total entries : #{entries.size}"
  puts " Successfully added : #{found_count}"
  puts " Duplicates skipped : #{duplicate_count}"
  puts " Malformed skipped : #{malformed_count}"
  puts " Not found in DB : #{not_found_count}"

  if unmatched_keys.any?
    puts "\n Unmatched entries:"
    unmatched_keys.each do |u|
      puts " - Line ##{u[:index]}: Buckwalter='#{u[:buckwalter]}', Clean='#{u[:clean]}'"
    end
  end

  puts "\n Import task complete!"
end
| 52 | + |
# Buckwalter character => Arabic output used by convert_buckwalter_to_arabic.
# Built once and frozen instead of being re-created on every call.
#
# Normalisation choices baked into the table:
# * alef variants (">", "<", "{") collapse to a bare alef;
# * standalone hamza and hamza on waw/ya ("'", "&", "}") are dropped;
# * vowel marks, tanween, shadda, sukun and punctuation are dropped.
#
# NOTE(review): "|" (alef madda) has no entry and is therefore passed through
# unchanged - confirm how text_clean stores it before adding a mapping.
BUCKWALTER_TO_ARABIC = {
  "'" => '', '>' => 'ا', '<' => 'ا', '&' => '', '}' => '', '{' => 'ا',
  'A' => 'ا', 'b' => 'ب', 't' => 'ت', 'v' => 'ث', 'j' => 'ج',
  'H' => 'ح', 'x' => 'خ', 'd' => 'د', '*' => 'ذ', 'r' => 'ر',
  'z' => 'ز', 's' => 'س', '$' => 'ش', 'S' => 'ص', 'D' => 'ض',
  'T' => 'ط', 'Z' => 'ظ', 'E' => 'ع', 'g' => 'غ', 'f' => 'ف',
  'q' => 'ق', 'k' => 'ك', 'l' => 'ل', 'm' => 'م', 'n' => 'ن',
  'h' => 'ه', 'w' => 'و', 'Y' => 'ى', 'y' => 'ي',
  # Ta marbuta: without this entry a Latin "p" leaked into the Arabic output,
  # so such keys could never match an Arabic text_clean value.
  'p' => 'ة'
}.merge(
  # Characters that are simply removed from the output.
  'FNKaui~o^#`_:;,.!-+%]['.each_char.map { |c| [c, ''] }.to_h
).freeze

# Converts a Buckwalter-transliterated string to (normalised, unvocalised)
# Arabic script, one character at a time.
#
# @param bw [String] Buckwalter transliteration
# @return [String] Arabic text; characters without a table entry are kept as-is
def convert_buckwalter_to_arabic(bw)
  bw.each_char.map { |c| BUCKWALTER_TO_ARABIC.fetch(c, c) }.join
end
| 69 | + |
# Strips Arabic diacritics and tatweel from a string.
#
# Removes code points U+064B..U+065F, the superscript alef U+0670 and the
# tatweel U+0640 in a single pass; every other character is left untouched.
#
# @param str [String] Arabic (or mixed) text
# @return [String] a new string without those characters
def remove_diacritics(str)
  str.delete("\u064B-\u065F\u0670\u0640")
end
| 73 | +end |
0 commit comments