Add lemma translation (#278)

MuhammadAbdullaah · web-flow · commit 4c37c2753f99 · 2025-05-06T18:47:53.000+05:00
* add rake task to import lemma translations from lemmas.txt

* minor fix

* remove lemmas.txt file
diff --git a/app/admin/grammar/lemma.rb b/app/admin/grammar/lemma.rb
@@ -4,12 +4,13 @@
   menu parent: 'Grammar'
   actions :all, except: :destroy
 
-  permit_params :text_madani, :text_clean, :words_count, :uniq_words_count
+  permit_params :text_madani, :text_clean, :words_count, :uniq_words_count, :en_translations
 
   filter :text_clean
   filter :text_madani
   filter :words_count
   filter :uniq_words_count
+  filter :en_translations
 
   searchable_select_options(
     scope: Lemma,
@@ -33,6 +34,9 @@
     end
     column :words_count
     column :uniq_words_count
+    column :en_translations do |resource|
+      resource.en_translations.present? ? resource.en_translations.join(', ') : status_tag('None')
+    end
     actions
   end
 
@@ -44,13 +48,24 @@
           resource.text_madani
         end
       end
-      row :text_clean do
+      row :text_clean do |resource|
         span class: 'qpc-hafs' do
           resource.text_clean
         end
       end
       row :words_count
       row :uniq_words_count
+      row :en_translations do |resource|
+        if resource.en_translations.present?
+          ul do
+            resource.en_translations.each do |t|
+              li t
+            end
+          end
+        else
+          status_tag 'None'
+        end
+      end
       row :created_at
       row :updated_at
     end
diff --git a/app/models/lemma.rb b/app/models/lemma.rb
@@ -3,6 +3,7 @@
 # Table name: lemmas
 #
 #  id               :integer          not null, primary key
+#  en_translations  :jsonb            not null
 #  text_clean       :string
 #  text_madani      :string
 #  uniq_words_count :integer
diff --git a/db/migrate/20250505072322_add_en_translations_to_lemmas.rb b/db/migrate/20250505072322_add_en_translations_to_lemmas.rb
@@ -0,0 +1,6 @@
+class AddEnTranslationsToLemmas < ActiveRecord::Migration[7.0]
+  def change
+    c = Lemma.connection
+    c.add_column :lemmas, :en_translations, :jsonb, default: [], null: false
+  end
+end
diff --git a/lib/tasks/import_lemmas.rake b/lib/tasks/import_lemmas.rake
@@ -0,0 +1,73 @@
+namespace :import do
+  desc "Import English translations from lemmas.txt into Lemma.en_translations (jsonb array)"
+  task import_lemma_translations: :environment do
+    file_path = Rails.root.join('lib', 'data', 'lemmas.txt')
+    entries   = File.readlines(file_path, chomp: true).reject(&:blank?)
+
+    found_count      = 0
+    not_found_count  = 0
+    duplicate_count  = 0
+    unmatched_keys   = []
+
+    puts "\nStarting import of English translations...\n\n"
+
+    entries.each_with_index do |line, idx|
+      buckwalter, eng = line.split("\t", 2).map(&:strip)
+      next unless buckwalter.present? && eng.present?
+
+      arabic = convert_buckwalter_to_arabic(buckwalter)
+      clean  = remove_diacritics(arabic)
+
+      lemma = Lemma.find_by(text_clean: clean)
+      if lemma
+        existing = lemma.en_translations || []
+        if existing.include?(eng)
+          duplicate_count += 1
+        else
+          existing << eng
+          lemma.update!(en_translations: existing)
+          found_count += 1
+        end
+      else
+        not_found_count += 1
+        unmatched_keys << { index: idx + 1, buckwalter: buckwalter, clean: clean }
+      end
+    end
+
+    puts "\n Import Summary:"
+    puts "   Total entries       : #{entries.size}"
+    puts "   Successfully added  : #{found_count}"
+    puts "   Duplicates skipped  : #{duplicate_count}"
+    puts "   Not found in DB     : #{not_found_count}"
+
+    if unmatched_keys.any?
+      puts "\n Unmatched entries:"
+      unmatched_keys.each do |u|
+        puts " - Line ##{u[:index]}: Buckwalter='#{u[:buckwalter]}', Clean='#{u[:clean]}'"
+      end
+    end
+
+    puts "\n Import task complete!"
+  end
+
+  def convert_buckwalter_to_arabic(bw)
+    mapping = {
+      "'" => '', '>' => 'ا', '<' => 'ا', "&" => '', "}" => '', "{" => 'ا',
+      "A" => 'ا', "b" => 'ب', "t" => 'ت', "v" => 'ث', "j" => 'ج',
+      "H" => 'ح', "x" => 'خ', "d" => 'د', "*" => 'ذ', "r" => 'ر',
+      "z" => 'ز', "s" => 'س', "$" => 'ش', "S" => 'ص', "D" => 'ض',
+      "T" => 'ط', "Z" => 'ظ', "E" => 'ع', "g" => 'غ', "f" => 'ف',
+      "q" => 'ق', "k" => 'ك', "l" => 'ل', "m" => 'م', "n" => 'ن',
+      "h" => 'ه', "w" => 'و', "Y" => 'ى', "y" => 'ي',
+      /[FNKaui~o^#`_:;,\.\!\-\+\%\]\[]/ => ''
+    }
+    bw.chars.map { |c|
+      map_key = mapping.keys.find { |k| k.is_a?(Regexp) ? k.match?(c) : k == c }
+      mapping[map_key] || c
+    }.join
+  end
+
+  def remove_diacritics(str)
+    str.gsub(/[\u064B-\u065F\u0670]/, '').gsub('ـ', '')
+  end
+end

Original file line number	Diff line number	Diff line change
`@@ -3,6 +3,7 @@`
`3`	`3`	`# Table name: lemmas`
`4`	`4`	`#`
`5`	`5`	`# id :integer not null, primary key`
	`6`	`+# en_translations :jsonb not null`
`6`	`7`	`# text_clean :string`
`7`	`8`	`# text_madani :string`
`8`	`9`	`# uniq_words_count :integer`