Skip to content

Commit 9492737

Browse files
committed
#539 Add support for cp1364 and cp1388.
Conversion tables contributed by [@BenceBenedek](https://github.com/BenceBenedek).
1 parent aa7c898 commit 9492737

File tree

11 files changed

+3471
-63
lines changed

11 files changed

+3471
-63
lines changed

README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -754,12 +754,14 @@ The following code pages are supported:
754754
* `common_extended` - EBCDIC common characters with special characters extension
755755
* `cp037` - IBM EBCDIC US-Canada
756756
* `cp037_extended` - IBM EBCDIC US-Canada with special characters extension
757+
* `cp300` - IBM EBCDIC Japanese Extended (2 byte code page)
757758
* `cp838` - IBM EBCDIC Thailand
758759
* `cp870` - IBM EBCDIC Multilingual Latin-2
759760
* `cp875` - IBM EBCDIC Greek
760761
* `cp1025` - IBM EBCDIC Multilingual Cyrillic
761762
* `cp1047` - IBM EBCDIC Latin-1/Open System
762-
* `cp00300` - (experimental support) IBM EBCDIC Japanese (Katakana) Extended (2 byte code page)
763+
* `cp1364` - (experimental support) IBM EBCDIC Korean (2 byte code page)
764+
* `cp1388` - (experimental support) IBM EBCDIC Simplified Chinese (2 byte code page)
763765

764766
By default, Cobrix uses the common EBCDIC code page, which contains only basic Latin characters, numbers, and punctuation.
765767
You can specify the code page to use for all string fields by setting the `ebcdic_code_page` option to one of the
@@ -1629,6 +1631,7 @@ A: Update hadoop dll to version 3.2.2 or newer.
16291631

16301632
## Changelog
16311633
- #### 2.6.5 (to be released soon)
1634+
- [#539](https://github.com/AbsaOSS/cobrix/issues/539) Fixed 'cp300', and added experimental support for 'cp1364' and 'cp1388' code pages (thanks [@BenceBenedek](https://github.com/BenceBenedek)).
16321635
- [#590](https://github.com/AbsaOSS/cobrix/issues/590) Changed from `.option("extended_metadata", true)` to `.option("metadata", "extended")` allowing other modes like 'basic' (default) and 'false' (disable metadata).
16331636
- [#593](https://github.com/AbsaOSS/cobrix/issues/593) Added the option `.option("generate_record_bytes", true)`, which adds a field containing the raw bytes of each decoded record.
16341637

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage.scala

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,18 +35,24 @@ abstract class CodePage extends Serializable {
3535

3636
object CodePage extends Logging {
3737

38+
/**
39+
* Code page names from: https://www.ibm.com/docs/en/zos-connect/zosconnect/3.0?topic=properties-coded-character-set-identifiers
40+
*/
3841
def getCodePageByName(codePageName: String): CodePage = {
3942
codePageName match {
4043
case "common" => new CodePageCommon
4144
case "common_extended" => new CodePageCommonExt
4245
case "cp037" => new CodePage037
4346
case "cp037_extended" => new CodePage037Ext
47+
case "cp00300" => new CodePage300 // This is the same as cp300
48+
case "cp300" => new CodePage300
4449
case "cp838" => new CodePage838
4550
case "cp870" => new CodePage870
4651
case "cp875" => new CodePage875
4752
case "cp1025" => new CodePage1025
4853
case "cp1047" => new CodePage1047
49-
case "cp00300" => new CodePage00300
54+
case "cp1364" => new CodePage1364
55+
case "cp1388" => new CodePage1388
5056
case codePage => throw new IllegalArgumentException(s"The code page '$codePage' is not one of the builtin EBCDIC code pages.")
5157
}
5258
}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
/*
2+
* Copyright 2018 ABSA Group Limited
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package za.co.absa.cobrix.cobol.parser.encoding.codepage
18+
19+
import za.co.absa.cobrix.cobol.parser.encoding.codepage.TwoByteCodePage.createEbcdicToUnicodeTable
20+
21+
/**
  * Two-byte EBCDIC code page CCSID-1364 (Korean).
  *
  * Decoding is delegated to [[TwoByteCodePage]], which is handed the lookup table
  * held by the companion object.
  */
class CodePage1364 extends TwoByteCodePage(CodePage1364.ebcdicToAsciiMapping) {
  /** Short name used to refer to this code page. */
  override def codePageShortName: String = "cp1364"
}

object CodePage1364 {
  /**
    * Lookup table for EBCDIC Code Page 1364, indexed by the two-byte EBCDIC code.
    *
    * Conversion tables contributed by https://github.com/BenceBenedek
    * https://www.ibm.com/docs/en/i/7.3?topic=reference-ccsid-values
    *
    * Despite the "Ascii" in the name, the table is produced by
    * `createEbcdicToUnicodeTable`. Being a `val`, it is built once when this
    * object is initialized.
    */
  val ebcdicToAsciiMapping: Array[Char] = {
    val ebcdicCodes = TwoByteTables1364.mappingTableEbcdic1364()
    val unicodeCodes = TwoByteTables1364.mappingTableUnicode1364()

    createEbcdicToUnicodeTable(ebcdicCodes, unicodeCodes)
  }
}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
/*
2+
* Copyright 2018 ABSA Group Limited
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package za.co.absa.cobrix.cobol.parser.encoding.codepage
18+
19+
import za.co.absa.cobrix.cobol.parser.encoding.codepage.TwoByteCodePage.createEbcdicToUnicodeTable
20+
21+
/**
  * Two-byte EBCDIC code page CCSID-1388 (Simplified Chinese).
  *
  * Decoding is delegated to [[TwoByteCodePage]], which is handed the lookup table
  * held by the companion object.
  */
class CodePage1388 extends TwoByteCodePage(CodePage1388.ebcdicToAsciiMapping) {
  /** Short name used to refer to this code page. */
  override def codePageShortName: String = "cp1388"
}

object CodePage1388 {
  /**
    * Lookup table for EBCDIC Code Page 1388, indexed by the two-byte EBCDIC code.
    *
    * Conversion tables contributed by https://github.com/BenceBenedek
    * https://www.ibm.com/docs/en/i/7.3?topic=reference-ccsid-values
    *
    * Despite the "Ascii" in the name, the table is produced by
    * `createEbcdicToUnicodeTable`. Being a `val`, it is built once when this
    * object is initialized.
    */
  val ebcdicToAsciiMapping: Array[Char] = {
    val ebcdicCodes = TwoByteTables1388.mappingTableEbcdic1388()
    val unicodeCodes = TwoByteTables1388.mappingTableUnicode1388()

    createEbcdicToUnicodeTable(ebcdicCodes, unicodeCodes)
  }
}

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage00300.scala renamed to cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/CodePage300.scala

Lines changed: 7 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -16,37 +16,21 @@
1616

1717
package za.co.absa.cobrix.cobol.parser.encoding.codepage
1818

19+
import za.co.absa.cobrix.cobol.parser.encoding.codepage.TwoByteCodePage.createEbcdicToUnicodeTable
20+
1921
/**
2022
* EBCDIC code page 300 Japanese Latin Host Double-Byte.
2123
*/
22-
class CodePage00300 extends TwoByteCodePage(CodePage00300.ebcdicToAsciiMapping) {
23-
override def codePageShortName: String = "cp00300"
24+
class CodePage300 extends TwoByteCodePage(CodePage300.ebcdicToAsciiMapping) {
25+
override def codePageShortName: String = "cp300"
2426
}
2527

26-
object CodePage00300 {
28+
object CodePage300 {
2729
val ebcdicToAsciiMapping: Array[Char] = {
2830
/**
29-
* This is the EBCDIC Code Page 00300 contributed by https://github.com/BenceBenedek
31+
* This is the EBCDIC Code Page CCSID-00300 contributed by https://github.com/BenceBenedek
3032
* https://public.dhe.ibm.com/software/globalization/gcoc/attachments/CP00300.pdf
3133
*/
32-
val ebcdic2ascii: Array[Char] = {
33-
val directMapping = new Array[Char](65536)
34-
35-
val ebcdic300 = TwoByteTables.mappingTableEbcdic300()
36-
val unicode300 = TwoByteTables.mappingTableUnicode300()
37-
38-
var i = 0
39-
val len = ebcdic300.length
40-
while (i < len) {
41-
val unicode = unicode300(i)
42-
val ebcdic = ebcdic300(i)
43-
directMapping(ebcdic) = unicode.toChar
44-
i += 1
45-
}
46-
directMapping
47-
}
48-
49-
50-
ebcdic2ascii
34+
createEbcdicToUnicodeTable(TwoByteTables300.mappingTableEbcdic300(), TwoByteTables300.mappingTableUnicode300())
5135
}
5236
}

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/TwoByteCodePage.scala

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,3 +91,19 @@ abstract class TwoByteCodePage(ebcdicToAsciiMapping: Array[Char]) extends CodePa
9191
buf.toString
9292
}
9393
}
94+
95+
object TwoByteCodePage {
  /**
    * Builds a direct-lookup decoding table from two parallel arrays.
    *
    * For every position `i` of `ebcdicTable`, the resulting array holds
    * `unicodeTable(i).toChar` at index `ebcdicTable(i)`. When the same EBCDIC code
    * appears more than once, the last occurrence wins. Indexes that are never
    * assigned keep the array's default value (`'\u0000'`).
    *
    * @param ebcdicTable  EBCDIC codes; each value is used as an index into the
    *                     65536-element result, so it must lie in 0..65535.
    * @param unicodeTable Unicode values paired by position with `ebcdicTable`.
    *                     Values are narrowed with `toChar`, so only the low 16 bits
    *                     are kept. Entries beyond `ebcdicTable.length` are ignored.
    * @return a 65536-element array indexed by EBCDIC code.
    * @throws ArrayIndexOutOfBoundsException if `unicodeTable` is shorter than
    *                                        `ebcdicTable`, or an EBCDIC code falls
    *                                        outside 0..65535.
    */
  def createEbcdicToUnicodeTable(ebcdicTable: Array[Int], unicodeTable: Array[Int]): Array[Char] = {
    val lookup = new Array[Char](65536)

    // Positions are driven by ebcdicTable only, exactly one assignment per entry, in order.
    ebcdicTable.indices.foreach { pos =>
      lookup(ebcdicTable(pos)) = unicodeTable(pos).toChar
    }

    lookup
  }
}

cobol-parser/src/main/scala/za/co/absa/cobrix/cobol/parser/encoding/codepage/TwoByteTables.java

Lines changed: 0 additions & 37 deletions
This file was deleted.

0 commit comments

Comments
 (0)