Skip to content

Commit 452a21a

Browse files
Under-represented Languages NLP Teamcopybara-github
authored andcommitted
No public description
PiperOrigin-RevId: 789746190
1 parent 64390d1 commit 452a21a

File tree

3 files changed

+130
-4
lines changed

3 files changed

+130
-4
lines changed

linguameta/data/he.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1328,6 +1328,11 @@
13281328
{
13291329
"name": "Ivrit",
13301330
"bcp_47_code": "he",
1331+
"source": "GOOGLE_RESEARCH"
1332+
},
1333+
{
1334+
"name": "עִבְרִית",
1335+
"bcp_47_code": "he",
13311336
"is_canonical": true,
13321337
"source": "GOOGLE_RESEARCH"
13331338
}

linguameta/data/zh.json

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
"wuu",
2020
"yue"
2121
],
22+
"total_population": 1288700000,
2223
"language_scope": {
2324
"scope": "MACROLANGUAGE",
2425
"source": "ISO_639"
@@ -1435,11 +1436,131 @@
14351436
"is_canonical": true,
14361437
"source": "CLDR"
14371438
},
1439+
{
1440+
"name": "中文",
1441+
"bcp_47_code": "zh",
1442+
"is_canonical": true,
1443+
"source": "CLDR"
1444+
},
14381445
{
14391446
"name": "isi-Chinese",
14401447
"bcp_47_code": "zu",
14411448
"is_canonical": true,
14421449
"source": "CLDR"
14431450
}
1451+
],
1452+
"language_script_locale": [
1453+
{
1454+
"script": {
1455+
"iso_15924_code": "hans",
1456+
"is_canonical": true,
1457+
"source": "GOOGLE_RESEARCH"
1458+
},
1459+
"locale": {
1460+
"iso_3166_code": "cn",
1461+
"source": "GOOGLE_RESEARCH"
1462+
},
1463+
"speaker_data": {
1464+
"number_of_speakers": 1255000000,
1465+
"source": "CLDR"
1466+
},
1467+
"official_status": {
1468+
"has_de_facto_official_status": true,
1469+
"source": "GOOGLE_RESEARCH"
1470+
},
1471+
"geolocation": {
1472+
"latitude": 35.33,
1473+
"longitude": 103.23,
1474+
"source": "GOOGLE_RESEARCH"
1475+
}
1476+
},
1477+
{
1478+
"script": {
1479+
"iso_15924_code": "hans",
1480+
"is_canonical": true,
1481+
"source": "GOOGLE_RESEARCH"
1482+
},
1483+
"locale": {
1484+
"iso_3166_code": "sg",
1485+
"source": "GOOGLE_RESEARCH"
1486+
},
1487+
"speaker_data": {
1488+
"number_of_speakers": 4800000,
1489+
"source": "CLDR"
1490+
},
1491+
"official_status": {
1492+
"has_official_status": true,
1493+
"source": "GOOGLE_RESEARCH"
1494+
}
1495+
},
1496+
{
1497+
"script": {
1498+
"iso_15924_code": "hant",
1499+
"source": "GOOGLE_RESEARCH"
1500+
},
1501+
"locale": {
1502+
"iso_3166_code": "cn",
1503+
"source": "GOOGLE_RESEARCH"
1504+
},
1505+
"speaker_data": {
1506+
"number_of_speakers": 1255000000,
1507+
"source": "CLDR"
1508+
}
1509+
},
1510+
{
1511+
"script": {
1512+
"iso_15924_code": "hant",
1513+
"is_canonical": true,
1514+
"source": "GOOGLE_RESEARCH"
1515+
},
1516+
"locale": {
1517+
"iso_3166_code": "hk",
1518+
"source": "GOOGLE_RESEARCH"
1519+
},
1520+
"speaker_data": {
1521+
"number_of_speakers": 6900000,
1522+
"source": "CLDR"
1523+
}
1524+
},
1525+
{
1526+
"script": {
1527+
"iso_15924_code": "hant",
1528+
"is_canonical": true,
1529+
"source": "GOOGLE_RESEARCH"
1530+
},
1531+
"locale": {
1532+
"iso_3166_code": "tw",
1533+
"source": "GOOGLE_RESEARCH"
1534+
},
1535+
"speaker_data": {
1536+
"number_of_speakers": 22000000,
1537+
"source": "CLDR"
1538+
},
1539+
"official_status": {
1540+
"has_official_status": true,
1541+
"source": "CLDR"
1542+
},
1543+
"geolocation": {
1544+
"latitude": 25.066668,
1545+
"longitude": 121.51667,
1546+
"source": "GOOGLE_RESEARCH"
1547+
}
1548+
},
1549+
{
1550+
"script": {
1551+
"iso_15924_code": "latn",
1552+
"is_canonical": false,
1553+
"is_for_transliteration": true,
1554+
"source": "GOOGLE_RESEARCH"
1555+
},
1556+
"locale": {
1557+
"iso_3166_code": "cn",
1558+
"source": "GOOGLE_RESEARCH"
1559+
},
1560+
"speaker_data": {
1561+
"number_of_speakers": 1255000000,
1562+
"source": "CLDR"
1563+
}
1564+
}
14441565
]
14451566
}

linguameta/linguameta.tsv

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -337,7 +337,7 @@ aqp aqp Atakapa Latn US Extinct False atak1252 Q10975683 extinct language
337337
aqr aqr Arhâ 170 Latn NC Severely endangered False arha1237 Q4790085
338338
aqt aqt Angaité 1000 Latn PY Severely endangered False anga1316 Q15736037
339339
aqz aqz Akuntsu 4 Latn BR Critically endangered False akun1241 Q4701960
340-
ar ara Arabic العربية 346277000 Arab AE, BH, DJ, DZ, EG, EH, ER, IL, IQ, JO, KM, KW, LB, LY, MA, MR, OM, PS, QA, SA, SD, SO, SY, TD, TN, XA, YE Official [AE], Official [BH], Official [DJ], Official [DZ], Official [EG], Official [EH], Official [IL], Official [IQ], Official [JO], Official [KM], Official [KW], Official [LB], Official [LY], Official [MA], Official [MR], Official [OM], Official [PS], Official [QA], Official [SA], Official [SD], Official [SO], Official [SY], Official [TD], Official [TN], Official [YE] True aao, abh, abv, acm, acq, acw, acx, acy, adf, aeb, aec, afb, ajp, apc, apd, arb, arq, ars, ary, arz, auz, avl, ayh, ayl, ayn, ayp, pga, shu, ssh
340+
ar ara Arabic العربية 346277000 Arab AE, BH, DJ, DZ, EG, EH, ER, IL, IQ, JO, KM, KW, LB, LY, MA, MR, OM, PS, QA, SA, SD, SO, SY, TD, TN, XA, YE Official [AE], Official [BH], Official [DJ], Official [DZ], Official [EG], Official [EH], Official [IL], Official [IQ], Official [JO], Official [KM], Official [KW], Official [LB], Official [LY], Official [MA], Official [MR], Official [OM], Official [PS], Official [QA], Official [SA], Official [SD], Official [SO], Official [SY], Official [TD], Official [TN], Official [YE] Not endangered True aao, abh, abv, acm, acq, acw, acx, acy, adf, aeb, aec, afb, ajp, apc, apd, arb, arq, ars, ary, arz, auz, avl, ayh, ayl, ayn, ayp, pga, shu, ssh
341341
arb arb Standard Arabic Not endangered False ar stan1318 Q56467 the standardized and literary variety of Arabic used in writing and in most formal speech
342342
ard ard Arabana 31 Latn AU False arab1267 Q3507959
343343
are are Western Arrarnta 4500 Latn AU Definitely endangered False west2441 Q12645549 Australian Aboriginal language
@@ -1828,7 +1828,7 @@ eya eya Eyak Latn US Extinct False eyak1241 Q27480 indigenous language of
18281828
eyo eyo Keiyo Elgeyo 310000 Latn KE Not endangered False kln keiy1238 Q56856
18291829
eza eza Ezaa Ezza 590000 Latn NG Not endangered False ezaa1238 Q11921436 Igbo language of Nigeria
18301830
eze eze Uzekwe 5000 Latn NG Not endangered False uzek1238 Q3502244
1831-
fa fas Persian فارسی 82000000 Arab, Latn AF, IR Official [AF], Official [IR] True pes, prs per
1831+
fa fas Persian فارسی 82000000 Arab, Latn AF, IR Official [AF], Official [IR] Not endangered True pes, prs per
18321832
faa faa Fasu 1200 Latn PG Definitely endangered False fasu1242 Q3446687
18331833
fab fab Fa D'Ambu Latn GQ Not endangered False fada1250 Q34992
18341834
fad fad Wagi 3400 Latn PG Vulnerable False wagi1249 Q7959569
@@ -2271,7 +2271,7 @@ hca hca Andaman Creole Hindi 10000 IN Not endangered False anda1280 Q7599
22712271
hch hch Huichol 45000 Latn MX Vulnerable False huic1243 Q35575 indigenous language of Mexico
22722272
hdn hdn Northern Haida 14 Latn CA Critically endangered False hai nort2938 Q20054484 dialect of Haida, a Canadian/Alaskan indigenous language isolate
22732273
hdy hdy Hadiyya ሃድያኛ 250000 Ethi, Latn ET Not endangered False hadi1240 Q56613 language spoken in Ethiopia
2274-
he heb Hebrew Ivrit 8700000 Hebr, Latn IL Official [IL] Not endangered False iw hebr1245 Q8141 standard form of the revived Hebrew language spoken today mainly in Israel
2274+
he heb Hebrew עִבְרִית 8700000 Hebr, Latn IL Official [IL] Not endangered False iw hebr1245 Q8141 standard form of the revived Hebrew language spoken today mainly in Israel
22752275
hea hea Northern Qiandong Miao Hmub 1200000 Latn CN Not endangered False hmn nort2747 Q3138832
22762276
hed hed Herdé Latn CM, TD Not endangered False herd1236 Q56253
22772277
heg heg Helong 14000 Latn ID Vulnerable False helo1243 Q35432 language in Nusa Tenggara
@@ -7382,7 +7382,7 @@ zgh zgh Standard Moroccan Tamazight ⵜⴰⵎⴰⵣⵉⵖⵜ 7800000 Latn, Tfng
73827382
zgm zgm Minz Zhuang 180000 Hani CN Not endangered False za minz1236 Q6862618
73837383
zgn zgn Guibian Zhuang 1000000 CN Not endangered False za guib1244 Q17651536
73847384
zgr zgr Magori 100 Latn PG Definitely endangered False mago1248 Q3277370
7385-
zh zho Chinese True cdo, cjy, cmn, cnp, cpx, csp, czh, czo, gan, hak, hsn, mnp, nan, wuu, yue chi
7385+
zh zho Chinese 中文 1288700000 Hans, Hant, Latn CN, HK, SG, TW Official [TW] Not endangered True cdo, cjy, cmn, cnp, cpx, csp, czh, czo, gan, hak, hsn, mnp, nan, wuu, yue chi
73867386
zhb zhb Zhaba CN Definitely endangered False zhab1238 Q56334
73877387
zhd zhd Dai Zhuang 100000 Hani, Latn CN Vulnerable False za daiz1235 Q5209052
73887388
zhi zhi Zhire Latn NG Not endangered False zhir1238 Q3914910

0 commit comments

Comments
 (0)