add locale: metanorma/bipm-si-brochure#191

opoudjis · opoudjis · commit b1f5d15eeb76 · 2022-09-25T17:39:14.000+10:00
diff --git a/lib/isodoc/i18n.rb b/lib/isodoc/i18n.rb
@@ -54,9 +54,10 @@ def set(key, val)
       @labels[key] = val
     end
 
-    def initialize(lang, script, i18nyaml: nil, i18nhash: nil)
+    def initialize(lang, script, locale: nil, i18nyaml: nil, i18nhash: nil)
       @lang = lang
       @script = script
+      @locale = locale
       y = load_yaml(lang, script, i18nyaml, i18nhash)
       @labels = y
       @labels["language"] = @lang
@@ -66,17 +67,16 @@ def initialize(lang, script, i18nyaml: nil, i18nhash: nil)
       end
     end
 
-    def self.l10n(text, lang = @lang, script = @script)
-      l10n(text, lang, script)
+    def self.l10n(text, lang = @lang, script = @script, locale = @locale) # class-level variant; defaults are read with the class as self
+      l10n(text, lang, script, locale) # NOTE(review): unqualified call in a class method resolves to self.l10n — looks self-recursive; confirm intended target
     end
 
-    # TODO: move to localization file
     # function localising spaces and punctuation.
     # Not clear if period needs to be localised for zh
-    def l10n(text, lang = @lang, script = @script)
-      if lang == "zh" && script == "Hans" then l10n_zh(text)
-      else bidiwrap(text, lang, script)
-      end
+    def l10n(text, lang = @lang, script = @script, locale = @locale) # localise spaces and punctuation in text, then bidi-wrap it
+      lang == "zh" && script == "Hans" and text = l10n_zh(text) # zh + Hans only: text is replaced by l10n_zh's result
+      lang == "fr" && text = l10n_fr(text, locale || "FR") # fr only: a nil locale falls back to "FR"
+      bidiwrap(text, lang, script) # return value; now runs for every language (the removed if/else skipped it for zh-Hans)
     end
 
     def bidiwrap(text, lang, script)
@@ -107,23 +107,51 @@ def l10n_zh(text)
       xml.to_xml.gsub(/<b>/, "").gsub("</b>", "").gsub(/<\?[^>]+>/, "")
     end
 
+    def l10n_fr(text, locale) # apply l10n_fr1 to every text node of a markup string and return the re-serialised string
+      xml = Nokogiri::HTML::DocumentFragment.parse(text)
+      xml.traverse do |n|
+        next unless n.text? # non-text nodes are left untouched
+
+        n.replace(cleanup_entities(l10n_fr1(n.text, locale), is_xml: false)) # cleanup_entities is defined outside this hunk
+      end
+      xml.to_xml
+    end
+
     ZH_CHAR = "\\p{Han}|\\p{In CJK Symbols And Punctuation}|"\
               "\\p{In Halfwidth And Fullwidth Forms}".freeze # regex source (alternation), interpolated into the zh patterns below
 
     # note: we can't differentiate comma from enumeration comma 、
     def l10_zh1(text) # NOTE(review): name lacks the "n" of its l10n_* siblings; callers are outside this view, so confirm before renaming
+      l10n_zh_remove_space(l10n_zh_punct(text)) # punctuation substitution first, then space removal
+    end
+
+    def l10n_zh_punct(text) # replace ASCII punctuation with its paired replacement where it adjoins ZH_CHAR text; returns the new string
       [":：", ",，", ".。", ")）", "]】", ":：", ";；", "?？", "!！"].each do |m| # m[0] is the ASCII mark, m[1] its replacement; NOTE(review): ":：" appears twice — confirm intentional
         text = text.gsub(/(?<=#{ZH_CHAR})#{Regexp.quote m[0]}/, m[1]) # mark directly after a ZH_CHAR character
+        text = text.gsub(/^#{Regexp.quote m[0]}/, m[1]) # mark at the start of a line
       end
       ["(（", "[【"].each do |m|
         text = text.gsub(/#{Regexp.quote m[0]}(?=#{ZH_CHAR})/, m[1]) # opening bracket directly before a ZH_CHAR character
       end
+      text
+    end
+
+    def l10n_zh_remove_space(text) # delete single spaces that adjoin ZH_CHAR characters; returns the new string
       text.gsub(/(?<=#{ZH_CHAR}) (?=#{ZH_CHAR})/o, "") # between two ZH_CHAR characters
         .gsub(/(?<=\d) (?=#{ZH_CHAR})/o, "") # digit, then ZH_CHAR
         .gsub(/(?<=#{ZH_CHAR}) (?=\d)/o, "") # ZH_CHAR, then digit
         .gsub(/(?<=#{ZH_CHAR}) (?=[A-Za-z](#{ZH_CHAR}|$))/o, "") # ZH_CHAR, then one ASCII letter that is followed by ZH_CHAR or end of line
     end
 
+    def l10n_fr1(text, locale) # insert no-break spaces around guillemets and before ; ? ! : in a plain-text string
+      text = text.gsub(/(?<=\p{Alnum})([»›;?!])/, "\u202f\\1") # U+202F before a mark that directly follows a letter or digit
+      text = text.gsub(/^([»›;?!])/, "\u202f\\1") # same when the mark starts a line
+      text = text.gsub(/([«‹])/, "\\1\u202f") # U+202F after every opening guillemet
+      colonsp = locale == "CH" ? "\u202f" : "\u00a0" # colon spacing: U+202F for locale "CH", U+00A0 for any other value
+      text = text.gsub(/(?<=\p{Alnum})(:)/, "#{colonsp}\\1") # NOTE(review): also matches the colon in tokens such as "http:" or "12:30" — confirm acceptable
+      text.gsub(/^(:)/, "#{colonsp}\\1") # colon at the start of a line; this result is the return value
+    end
+
     def boolean_conj(list, conn)
       case list.size
       when 0 then ""
diff --git a/spec/isodoc/base_spec.rb b/spec/isodoc/base_spec.rb
@@ -38,7 +38,8 @@
   end
 
   it "loads language hash overrides" do
-    c = IsoDoc::I18n.new("en", "Latn", i18nhash: YAML.load_file("spec/assets/new.yaml"))
+    c = IsoDoc::I18n.new("en", "Latn",
+                         i18nhash: YAML.load_file("spec/assets/new.yaml"))
     expect(c.text).to eq "text2"
     expect(c.at).to eq "at"
     expect(c.hash.to_s).to be_equivalent_to '{"key1"=>"val1", "key2"=>"val2"}'
@@ -85,6 +86,25 @@
       .to be_equivalent_to "&#x61c;Code (hello, world.)&#x61c;"
   end
 
+  it "does French localisation" do # exercises l10n for "fr" with no locale, locale "FR" and locale "CH"
+    e = HTMLEntities.new # output is hex-entity encoded so the inserted spaces are visible in the expectations
+    c = IsoDoc::I18n.new("fr", "Latn") # no locale given
+    expect(e.encode(c.l10n("Code; «code» and: code!"), :hexadecimal))
+      .to be_equivalent_to "Code&#x202f;; &#xab;&#x202f;code&#x202f;&#xbb; "\
+                           "and&#xa0;: code&#x202f;!"
+    expect(e.encode(c.l10n("Code; &#xab;code&#xbb; and: code!"), :hexadecimal)) # guillemets supplied as entities; same expected output
+      .to be_equivalent_to "Code&#x202f;; &#xab;&#x202f;code&#x202f;&#xbb; "\
+                           "and&#xa0;: code&#x202f;!"
+    c = IsoDoc::I18n.new("fr", "Latn", locale: "FR") # explicit "FR"; same expected output as with no locale
+    expect(e.encode(c.l10n("Code; «code» and: code!"), :hexadecimal))
+      .to be_equivalent_to "Code&#x202f;; &#xab;&#x202f;code&#x202f;&#xbb; "\
+                           "and&#xa0;: code&#x202f;!"
+    c = IsoDoc::I18n.new("fr", "Latn", locale: "CH") # only the space before ":" differs: U+202F instead of U+00A0
+    expect(e.encode(c.l10n("Code; «code» and: code!"), :hexadecimal))
+      .to be_equivalent_to "Code&#x202f;; &#xab;&#x202f;code&#x202f;&#xbb; "\
+                           "and&#x202f;: code&#x202f;!"
+  end
+
   it "does boolean conjunctions" do
     c = IsoDoc::I18n.new("en", "Latn")
     expect(c.boolean_conj([], "and")).to eq ""