Skip to content

Commit 4c37c27

Browse files
Add lemma translation (#278)
* add rake task to import lemma translations from lemmas.txt * minor fix * remove lemmas.txt file
1 parent 4aba881 commit 4c37c27

File tree

4 files changed

+97
-2
lines changed

4 files changed

+97
-2
lines changed

app/admin/grammar/lemma.rb

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,13 @@
44
menu parent: 'Grammar'
55
actions :all, except: :destroy
66

7-
permit_params :text_madani, :text_clean, :words_count, :uniq_words_count
7+
permit_params :text_madani, :text_clean, :words_count, :uniq_words_count, :en_translations
88

99
filter :text_clean
1010
filter :text_madani
1111
filter :words_count
1212
filter :uniq_words_count
13+
filter :en_translations
1314

1415
searchable_select_options(
1516
scope: Lemma,
@@ -33,6 +34,9 @@
3334
end
3435
column :words_count
3536
column :uniq_words_count
37+
column :en_translations do |resource|
38+
resource.en_translations.present? ? resource.en_translations.join(', ') : status_tag('None')
39+
end
3640
actions
3741
end
3842

@@ -44,13 +48,24 @@
4448
resource.text_madani
4549
end
4650
end
47-
row :text_clean do
51+
row :text_clean do |resource|
4852
span class: 'qpc-hafs' do
4953
resource.text_clean
5054
end
5155
end
5256
row :words_count
5357
row :uniq_words_count
58+
row :en_translations do |resource|
59+
if resource.en_translations.present?
60+
ul do
61+
resource.en_translations.each do |t|
62+
li t
63+
end
64+
end
65+
else
66+
status_tag 'None'
67+
end
68+
end
5469
row :created_at
5570
row :updated_at
5671
end

app/models/lemma.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
# Table name: lemmas
44
#
55
# id :integer not null, primary key
6+
# en_translations :jsonb not null
67
# text_clean :string
78
# text_madani :string
89
# uniq_words_count :integer
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# Adds the `en_translations` jsonb column (default `[]`, NOT NULL) to `lemmas`.
#
# The schema change is issued on `Lemma.connection` rather than through the
# migration's own connection. Calls made directly on a connection object are
# not recorded by ActiveRecord's reversible `change` machinery, so the
# migration spells out `up` and `down` explicitly to make rollback work.
class AddEnTranslationsToLemmas < ActiveRecord::Migration[7.0]
  def up
    Lemma.connection.add_column :lemmas, :en_translations, :jsonb, default: [], null: false
  end

  def down
    Lemma.connection.remove_column :lemmas, :en_translations
  end
end

lib/tasks/import_lemmas.rake

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
namespace :import do
2+
desc "Import English translations from lemmas.txt into Lemma.en_translations (jsonb array)"
3+
# Reads lib/data/lemmas.txt and appends English translations to matching lemmas.
#
# Each non-blank line is split on the first tab into a Buckwalter key and an
# English gloss; lines missing either part are skipped. The key is converted
# to Arabic, stripped of diacritics, and looked up via `Lemma#text_clean`.
# A gloss already present on the lemma is counted as a duplicate; otherwise it
# is appended to `en_translations`. A summary and the list of unmatched keys
# are printed at the end.
task import_lemma_translations: :environment do
  file_path = Rails.root.join('lib', 'data', 'lemmas.txt')
  # Fail with a readable message instead of a raw Errno::ENOENT backtrace.
  abort "Lemma data file not found: #{file_path}" unless File.exist?(file_path)

  # Keep each line paired with its position in the file *before* dropping
  # blank lines, so the "Line #" reported below matches the real file.
  entries = File.readlines(file_path, chomp: true)
                .each_with_index
                .reject { |line, _index| line.blank? }

  found_count = 0
  not_found_count = 0
  duplicate_count = 0
  unmatched_keys = []

  puts "\nStarting import of English translations...\n\n"

  entries.each do |line, index|
    buckwalter, eng = line.split("\t", 2).map(&:strip)
    next unless buckwalter.present? && eng.present?

    arabic = convert_buckwalter_to_arabic(buckwalter)
    clean = remove_diacritics(arabic)

    lemma = Lemma.find_by(text_clean: clean)
    unless lemma
      not_found_count += 1
      unmatched_keys << { index: index + 1, buckwalter: buckwalter, clean: clean }
      next
    end

    existing = lemma.en_translations || []
    if existing.include?(eng)
      duplicate_count += 1
    else
      # Assign a new array rather than mutating the loaded attribute in place.
      lemma.update!(en_translations: existing + [eng])
      found_count += 1
    end
  end

  puts "\n Import Summary:"
  puts " Total entries : #{entries.size}"
  puts " Successfully added : #{found_count}"
  puts " Duplicates skipped : #{duplicate_count}"
  puts " Not found in DB : #{not_found_count}"

  if unmatched_keys.any?
    puts "\n Unmatched entries:"
    unmatched_keys.each do |u|
      puts " - Line ##{u[:index]}: Buckwalter='#{u[:buckwalter]}', Clean='#{u[:clean]}'"
    end
  end

  puts "\n Import task complete!"
end
52+
53+
# Lookup table from a single Buckwalter character to its Arabic replacement.
#
# Alef variants (>, <, {, A) are all folded to a bare alef. The hamza forms
# (', &, }) and every short-vowel / diacritic / punctuation symbol map to the
# empty string, i.e. they are dropped from the output.
# NOTE(review): Buckwalter "|" (alef with madda) is not mapped and passes
# through unchanged — confirm whether it should fold to a bare alef as well.
BUCKWALTER_TO_ARABIC = {
  '>' => 'ا', '<' => 'ا', '{' => 'ا', 'A' => 'ا',
  'b' => 'ب', 't' => 'ت', 'v' => 'ث', 'j' => 'ج',
  'H' => 'ح', 'x' => 'خ', 'd' => 'د', '*' => 'ذ', 'r' => 'ر',
  'z' => 'ز', 's' => 'س', '$' => 'ش', 'S' => 'ص', 'D' => 'ض',
  'T' => 'ط', 'Z' => 'ظ', 'E' => 'ع', 'g' => 'غ', 'f' => 'ف',
  'q' => 'ق', 'k' => 'ك', 'l' => 'ل', 'm' => 'م', 'n' => 'ن',
  'h' => 'ه', 'w' => 'و', 'Y' => 'ى', 'y' => 'ي',
  # Ta marbuta: previously unmapped, which left a Latin "p" in the result.
  'p' => 'ة'
}.merge(
  "'&}FNKaui~o^#`_:;,.!-+%][".each_char.to_h { |symbol| [symbol, ''] }
).freeze

# Converts a Buckwalter-transliterated string to unvocalised Arabic script.
#
# @param bw [String] Buckwalter transliteration
# @return [String] the input with every mapped character replaced (or dropped);
#   characters that are not in the table are kept as-is
def convert_buckwalter_to_arabic(bw)
  bw.each_char.map { |char| BUCKWALTER_TO_ARABIC.fetch(char, char) }.join
end
69+
70+
# Strips Arabic diacritics and tatweel from a string.
#
# Removes code points U+064B..U+065F, U+0670 (superscript alef) and
# U+0640 (tatweel) in a single pass; every other character is kept.
#
# @param str [String]
# @return [String] a new string without those characters
def remove_diacritics(str)
  str.delete("\u064B-\u065F\u0670\u0640")
end
73+
end

0 commit comments

Comments
 (0)