Skip to content

Commit 37f2d2b

Browse files
authored
Merge pull request #23 from IndicoDataSolutions/mawelborn/feature-parity
Python Toolkit v7.2.3 Feature Parity for Results and EtlOutputs
2 parents 4b3d8af + 16568e4 commit 37f2d2b

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

42 files changed

+1626
-355
lines changed
Lines changed: 33 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,44 @@
11
using IndicoToolkit.EtlOutputs;
22
using Xunit;
33

4-
namespace IndicoToolkit.Tests;
4+
namespace IndicoToolkit.Tests.EtlOutputs;
55

66

7-
public class EtlOutputTests
7+
public class FileTests
88
{
9-
// The base directory will be IndicoToolkit.Tests/bin/Debug/net8.0/
10-
private string SamplesFolder = Path.Combine(
9+
// The base directory will be IndicoToolkit.Tests/bin/Debug/net*/
10+
private static readonly string SamplesFolder = Path.Combine(
1111
AppDomain.CurrentDomain.BaseDirectory,
1212
"..", "..", "..", "EtlOutputs", "Samples"
1313
);
1414

15-
public string ReadUri(string uri)
15+
private static string ReadUri(string uri)
1616
{
1717
var storageFolderPath = uri.Split("/storage/submission/").Last();
1818
var filePath = Path.Combine(SamplesFolder, storageFolderPath);
1919
return File.ReadAllText(filePath);
2020
}
2121

22-
public async Task<string> ReadUriAsync(string uri)
22+
private static async Task<string> ReadUriAsync(string uri)
2323
{
2424
var storageFolderPath = uri.Split("/storage/submission/").Last();
2525
var filePath = Path.Combine(SamplesFolder, storageFolderPath);
26-
return File.ReadAllText(filePath);
26+
return await File.ReadAllTextAsync(filePath);
2727
}
2828

2929
[Theory]
3030
[InlineData("4723/111922/110237/etl_output.json")]
3131
[InlineData("4724/111923/110238/etl_output.json")]
3232
[InlineData("4725/111924/110239/etl_output.json")]
33-
public void TestSampleFiles(string filename)
33+
public void TestFileLoad(string filename)
3434
{
3535
var etlOutput = EtlOutput.Load(filename, reader: ReadUri);
36-
var pageCount = etlOutput.TextOnPage.Count;
36+
var pageCount = etlOutput.TextOnPage.Length;
3737
var charCount = etlOutput.Text.Length;
38-
var tokenCount = etlOutput.Tokens.Count;
39-
var tableCount = etlOutput.Tables.Count;
38+
var tokenCount = etlOutput.Tokens.Length;
39+
var tableCount = etlOutput.Tables.Length;
4040

41-
Assert.Equal(pageCount, 2);
41+
Assert.Equal(2, pageCount);
4242
Assert.InRange(charCount, 2090, 2093);
4343
Assert.InRange(tokenCount, 326, 331);
4444
Assert.True(tableCount == 0 || tableCount == 4);
@@ -48,15 +48,15 @@ public void TestSampleFiles(string filename)
4848
[InlineData("4723/111922/110237/etl_output.json")]
4949
[InlineData("4724/111923/110238/etl_output.json")]
5050
[InlineData("4725/111924/110239/etl_output.json")]
51-
public async Task TestSampleFilesAsync(string filename)
51+
public async Task TestFileLoadAsync(string filename)
5252
{
5353
var etlOutput = await EtlOutput.LoadAsync(filename, reader: ReadUriAsync);
54-
var pageCount = etlOutput.TextOnPage.Count;
54+
var pageCount = etlOutput.TextOnPage.Length;
5555
var charCount = etlOutput.Text.Length;
56-
var tokenCount = etlOutput.Tokens.Count;
57-
var tableCount = etlOutput.Tables.Count;
56+
var tokenCount = etlOutput.Tokens.Length;
57+
var tableCount = etlOutput.Tables.Length;
5858

59-
Assert.Equal(pageCount, 2);
59+
Assert.Equal(2, pageCount);
6060
Assert.InRange(charCount, 2090, 2093);
6161
Assert.InRange(tokenCount, 326, 331);
6262
Assert.True(tableCount == 0 || tableCount == 4);
@@ -66,35 +66,35 @@ public async Task TestSampleFilesAsync(string filename)
6666
[InlineData("4723/111922/110237/etl_output.json")]
6767
[InlineData("4724/111923/110238/etl_output.json")]
6868
[InlineData("4725/111924/110239/etl_output.json")]
69-
public void TestSampleFilesDisableValues(string filename)
69+
public void TestFileLoadDisableValues(string filename)
7070
{
7171
var etlOutput = EtlOutput.Load(filename, reader: ReadUri, text: false, tokens: false, tables: false);
72-
var pageCount = etlOutput.TextOnPage.Count;
72+
var pageCount = etlOutput.TextOnPage.Length;
7373
var charCount = etlOutput.Text.Length;
74-
var tokenCount = etlOutput.Tokens.Count;
75-
var tableCount = etlOutput.Tables.Count;
74+
var tokenCount = etlOutput.Tokens.Length;
75+
var tableCount = etlOutput.Tables.Length;
7676

77-
Assert.Equal(pageCount, 0);
78-
Assert.Equal(charCount, 0);
79-
Assert.Equal(tokenCount, 0);
80-
Assert.Equal(tableCount, 0);
77+
Assert.Equal(0, pageCount);
78+
Assert.Equal(0, charCount);
79+
Assert.Equal(0, tokenCount);
80+
Assert.Equal(0, tableCount);
8181
}
8282

8383
[Theory]
8484
[InlineData("4723/111922/110237/etl_output.json")]
8585
[InlineData("4724/111923/110238/etl_output.json")]
8686
[InlineData("4725/111924/110239/etl_output.json")]
87-
public async Task TestSampleFilesDisableValuesAsync(string filename)
87+
public async Task TestFileLoadDisableValuesAsync(string filename)
8888
{
8989
var etlOutput = await EtlOutput.LoadAsync(filename, reader: ReadUriAsync, text: false, tokens: false, tables: false);
90-
var pageCount = etlOutput.TextOnPage.Count;
90+
var pageCount = etlOutput.TextOnPage.Length;
9191
var charCount = etlOutput.Text.Length;
92-
var tokenCount = etlOutput.Tokens.Count;
93-
var tableCount = etlOutput.Tables.Count;
92+
var tokenCount = etlOutput.Tokens.Length;
93+
var tableCount = etlOutput.Tables.Length;
9494

95-
Assert.Equal(pageCount, 0);
96-
Assert.Equal(charCount, 0);
97-
Assert.Equal(tokenCount, 0);
98-
Assert.Equal(tableCount, 0);
95+
Assert.Equal(0, pageCount);
96+
Assert.Equal(0, charCount);
97+
Assert.Equal(0, tokenCount);
98+
Assert.Equal(0, tableCount);
9999
}
100100
}
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
using IndicoToolkit.EtlOutputs;
2+
using Xunit;
3+
4+
namespace IndicoToolkit.Tests.EtlOutputs;
5+
6+
7+
public class RowspanColspanTests
8+
{
9+
// The base directory will be IndicoToolkit.Tests/bin/Debug/net*/
10+
private static readonly string SamplesFolder = Path.Combine(
11+
AppDomain.CurrentDomain.BaseDirectory,
12+
"..", "..", "..", "EtlOutputs", "Samples"
13+
);
14+
private static readonly string EtlOutputFile = Path.Combine(
15+
SamplesFolder,
16+
"4725", "112731", "112257", "etl_output_rs_cs.json"
17+
);
18+
19+
private static string ReadUri(string uri)
20+
{
21+
var storageFolderPath = uri.Split("/storage/submission/").Last();
22+
var filePath = Path.Combine(SamplesFolder, storageFolderPath);
23+
return File.ReadAllText(filePath);
24+
}
25+
26+
/*
27+
The table for these tests looks like:
28+
29+
| Alfa     | Bravo   | Charlie | Delta   |
30+
|----------|-------------------|---------|
31+
|          | Foxtrot           | Golf    |
32+
| Echo     |-------------------|---------|
33+
|          | Hotel   | India   | Juliett |
34+
|----------|-------------------|---------|
35+
| Kilo     |                   | Mike    |
36+
|----------| Lima              |---------|
37+
| November |                   | Oscar   |
38+
----------------------------------------
39+
*/
40+
private static EtlOutput EtlOutput => EtlOutput.Load(EtlOutputFile, reader: ReadUri);
41+
private static Table Table => EtlOutput.Tables.First();
42+
43+
[Fact]
44+
public void TestCells()
45+
{
46+
var cells = Table.Cells.Select(cell => cell.Text);
47+
var expectedCells = new List<string> {
48+
"Alfa", "Bravo", "Charlie", "Delta",
49+
"Echo", "Foxtrot", "Golf",
50+
"Hotel", "India", "Juliett",
51+
"Kilo", "Lima", "Mike",
52+
"November", "Oscar",
53+
};
54+
55+
Assert.Equal(expectedCells, cells);
56+
}
57+
58+
[Fact]
59+
public void TestRows()
60+
{
61+
var rows = Table.Rows.Select(row => row.Select(cell => cell.Text).ToList()).ToList();
62+
var expectedRows = new List<List<string>> {
63+
new() { "Alfa", "Bravo", "Charlie", "Delta" },
64+
new() { "Echo", "Foxtrot", "Foxtrot", "Golf" },
65+
new() { "Echo", "Hotel", "India", "Juliett" },
66+
new() { "Kilo", "Lima", "Lima", "Mike" },
67+
new() { "November", "Lima", "Lima", "Oscar" },
68+
};
69+
70+
Assert.Equal(expectedRows, rows);
71+
}
72+
73+
[Fact]
74+
public void TestColumns()
75+
{
76+
var columns = Table.Columns.Select(column => column.Select(cell => cell.Text).ToList()).ToList();
77+
var expectedColumns = new List<List<string>> {
78+
new() {
79+
"Alfa",
80+
"Echo",
81+
"Echo",
82+
"Kilo",
83+
"November",
84+
},
85+
new() {
86+
"Bravo",
87+
"Foxtrot",
88+
"Hotel",
89+
"Lima",
90+
"Lima",
91+
},
92+
new() {
93+
"Charlie",
94+
"Foxtrot",
95+
"India",
96+
"Lima",
97+
"Lima",
98+
},
99+
new() {
100+
"Delta",
101+
"Golf",
102+
"Juliett",
103+
"Mike",
104+
"Oscar",
105+
},
106+
};
107+
108+
Assert.Equal(expectedColumns, columns);
109+
}
110+
111+
[Theory]
112+
[InlineData(0, 25, 29, "Alfa")]
113+
[InlineData(0, 30, 35, "Bravo")]
114+
[InlineData(0, 36, 43, "Charlie")]
115+
[InlineData(0, 44, 49, "Delta")]
116+
[InlineData(0, 50, 54, "Echo")]
117+
[InlineData(0, 55, 62, "Foxtrot")]
118+
[InlineData(0, 64, 68, "Golf")]
119+
[InlineData(0, 70, 75, "Hotel")]
120+
[InlineData(0, 76, 81, "India")]
121+
[InlineData(0, 82, 89, "Juliett")]
122+
[InlineData(0, 90, 94, "Kilo")]
123+
[InlineData(0, 111, 115, "Lima")]
124+
[InlineData(0, 97, 101, "Mike")]
125+
[InlineData(0, 102, 110, "November")]
126+
[InlineData(0, 117, 122, "Oscar")]
127+
public void TestTableCellFor(int page, int start, int end, string expectedText)
128+
{
129+
var span = new Span(page, start, end);
130+
var tableCell = EtlOutput.TableCellsFor(span).First();
131+
132+
Assert.Equal(expectedText, tableCell.Cell.Text);
133+
}
134+
}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"email_metadata":{},"full_text":"indico-file:///storage/submission/4725/112731/112257/full_text.txt","num_pages":1,"pages":[{"blocks":"indico-file:///storage/submission/4725/112731/112257/page_0_blocks.json","characters":"indico-file:///storage/submission/4725/112731/112257/page_0_chars.json","doc_offset":{"end":151,"start":0},"dpi":{"dpix":300,"dpiy":300},"filename":"Rowspan Colspan Sample.png","image":"indico-file:///storage/submission/4725/112731/112257/original_page_0.png","page_info":"indico-file:///storage/submission/4725/112731/112257/page_info_0.json","page_num":0,"size":{"height":3883,"width":2556},"tables":"indico-file:///storage/submission/4725/112731/112257/tables_0.json","text":"indico-file:///storage/submission/4725/112731/112257/page_0_text.txt","thumbnail":"indico-file:///storage/submission/4725/112731/112257/original_thumbnail_0.png","tokens":"indico-file:///storage/submission/4725/112731/112257/page_0_tokens.json"}]}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Rowspan / Colspan Sample
2+
Alfa Bravo Charlie Delta
3+
Echo Foxtrot Golf
4+
Hotel India Juliett
5+
Kilo Mike
6+
November Lima Oscar
7+
formatted by Markdeep 1.18_d
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
[{"block_offset":{"end":7,"start":0},"doc_offset":{"end":7,"start":0},"page_num":0,"page_offset":{"end":7,"start":0},"position":{"bbBot":287,"bbLeft":561,"bbRight":1015,"bbTop":192,"bottom":287,"left":561,"right":1015,"top":192},"style":{"background_color":null,"bold":null,"font_face":null,"font_size":null,"handwriting":false,"italic":null,"text_color":null,"underlined":null},"text":"Rowspan"},{"block_offset":{"end":9,"start":8},"doc_offset":{"end":9,"start":8},"page_num":0,"page_offset":{"end":9,"start":8},"position":{"bbBot":290,"bbLeft":1070,"bbRight":1123,"bbTop":187,"bottom":290,"left":1070,"right":1123,"top":187},"style":{"background_color":null,"bold":null,"font_face":null,"font_size":null,"handwriting":false,"italic":null,"text_color":null,"underlined":null},"text":"/"},{"block_offset":{"end":17,"start":10},"doc_offset":{"end":17,"start":10},"page_num":0,"page_offset":{"end":17,"start":10},"position":{"bbBot":291,"bbLeft":1158,"bbRight":1547,"bbTop":185,"bottom":291,"left":1158,"right":1547,"top":185},"style":{"background_color":null,"bold":null,"font_face":null,"font_size":null,"handwriting":false,"italic":null,"text_color":null,"underlined":null},"text":"Colspan"},{"block_offset":{"end":24,"start":18},"doc_offset":{"end":24,"start":18},"page_num":0,"page_offset":{"end":24,"start":18},"position":{"bbBot":288,"bbLeft":1601,"bbRight":1978,"bbTop":184,"bottom":288,"left":1601,"right":1978,"top":184},"style":{"background_color":null,"bold":null,"font_face":null,"font_size":null,"handwriting":false,"italic":null,"text_color":null,"underlined":null},"text":"Sample"},{"block_offset":{"end":4,"start":0},"doc_offset":{"end":29,"start":25},"page_num":0,"page_offset":{"end":29,"start":25},"position":{"bbBot":520,"bbLeft":772,"bbRight":858,"bbTop":477,"bottom":520,"left":772,"right":858,"top":477},"style":{"background_color":null,"bold":null,"font_face":null,"font_size":null,"handwriting":false,"italic":null,"text_color":null,"underlined":null},"text":"Alfa"},{"block_o
ffset":{"end":10,"start":5},"doc_offset":{"end":35,"start":30},"page_num":0,"page_offset":{"end":35,"start":30},"position":{"bbBot":520,"bbLeft":1109,"bbRight":1242,"bbTop":476,"bottom":520,"left":1109,"right":1242,"top":476},"style":{"background_color":null,"bold":null,"font_face":null,"font_size":null,"handwriting":false,"italic":null,"text_color":null,"underlined":null},"text":"Bravo"},{"block_offset":{"end":18,"start":11},"doc_offset":{"end":43,"start":36},"page_num":0,"page_offset":{"end":43,"start":36},"position":{"bbBot":521,"bbLeft":1358,"bbRight":1523,"bbTop":475,"bottom":521,"left":1358,"right":1523,"top":475},"style":{"background_color":null,"bold":null,"font_face":null,"font_size":null,"handwriting":false,"italic":null,"text_color":null,"underlined":null},"text":"Charlie"},{"block_offset":{"end":24,"start":19},"doc_offset":{"end":49,"start":44},"page_num":0,"page_offset":{"end":49,"start":44},"position":{"bbBot":518,"bbLeft":1635,"bbRight":1757,"bbTop":473,"bottom":518,"left":1635,"right":1757,"top":473},"style":{"background_color":null,"bold":null,"font_face":null,"font_size":null,"handwriting":false,"italic":null,"text_color":null,"underlined":null},"text":"Delta"},{"block_offset":{"end":29,"start":25},"doc_offset":{"end":54,"start":50},"page_num":0,"page_offset":{"end":54,"start":50},"position":{"bbBot":697,"bbLeft":767,"bbRight":875,"bbTop":652,"bottom":697,"left":767,"right":875,"top":652},"style":{"background_color":null,"bold":null,"font_face":null,"font_size":null,"handwriting":false,"italic":null,"text_color":null,"underlined":null},"text":"Echo"},{"block_offset":{"end":37,"start":30},"doc_offset":{"end":62,"start":55},"page_num":0,"page_offset":{"end":62,"start":55},"position":{"bbBot":648,"bbLeft":1106,"bbRight":1269,"bbTop":592,"bottom":648,"left":1106,"right":1269,"top":592},"style":{"background_color":null,"bold":null,"font_face":null,"font_size":null,"handwriting":false,"italic":null,"text_color":null,"underlined":null},"text":"Foxtrot"},{
"block_offset":{"end":43,"start":39},"doc_offset":{"end":68,"start":64},"page_num":0,"page_offset":{"end":68,"start":64},"position":{"bbBot":640,"bbLeft":1636,"bbRight":1727,"bbTop":597,"bottom":640,"left":1636,"right":1727,"top":597},"style":{"background_color":null,"bold":null,"font_face":null,"font_size":null,"handwriting":false,"italic":null,"text_color":null,"underlined":null},"text":"Golf"},{"block_offset":{"end":50,"start":45},"doc_offset":{"end":75,"start":70},"page_num":0,"page_offset":{"end":75,"start":70},"position":{"bbBot":754,"bbLeft":1106,"bbRight":1231,"bbTop":698,"bottom":754,"left":1106,"right":1231,"top":698},"style":{"background_color":null,"bold":null,"font_face":null,"font_size":null,"handwriting":false,"italic":null,"text_color":null,"underlined":null},"text":"Hotel"},{"block_offset":{"end":56,"start":51},"doc_offset":{"end":81,"start":76},"page_num":0,"page_offset":{"end":81,"start":76},"position":{"bbBot":748,"bbLeft":1355,"bbRight":1465,"bbTop":706,"bottom":748,"left":1355,"right":1465,"top":706},"style":{"background_color":null,"bold":null,"font_face":null,"font_size":null,"handwriting":false,"italic":null,"text_color":null,"underlined":null},"text":"India"},{"block_offset":{"end":64,"start":57},"doc_offset":{"end":89,"start":82},"page_num":0,"page_offset":{"end":89,"start":82},"position":{"bbBot":749,"bbLeft":1636,"bbRight":1771,"bbTop":703,"bottom":749,"left":1636,"right":1771,"top":703},"style":{"background_color":null,"bold":null,"font_face":null,"font_size":null,"handwriting":false,"italic":null,"text_color":null,"underlined":null},"text":"Juliett"},{"block_offset":{"end":69,"start":65},"doc_offset":{"end":94,"start":90},"page_num":0,"page_offset":{"end":94,"start":90},"position":{"bbBot":867,"bbLeft":765,"bbRight":852,"bbTop":811,"bottom":867,"left":765,"right":852,"top":811},"style":{"background_color":null,"bold":null,"font_face":null,"font_size":null,"handwriting":false,"italic":null,"text_color":null,"underlined":null},"text":"Ki
lo"},{"block_offset":{"end":76,"start":72},"doc_offset":{"end":101,"start":97},"page_num":0,"page_offset":{"end":101,"start":97},"position":{"bbBot":857,"bbLeft":1633,"bbRight":1738,"bbTop":813,"bottom":857,"left":1633,"right":1738,"top":813},"style":{"background_color":null,"bold":null,"font_face":null,"font_size":null,"handwriting":false,"italic":null,"text_color":null,"underlined":null},"text":"Mike"},{"block_offset":{"end":85,"start":77},"doc_offset":{"end":110,"start":102},"page_num":0,"page_offset":{"end":110,"start":102},"position":{"bbBot":968,"bbLeft":767,"bbRight":1001,"bbTop":923,"bottom":968,"left":767,"right":1001,"top":923},"style":{"background_color":null,"bold":null,"font_face":null,"font_size":null,"handwriting":false,"italic":null,"text_color":null,"underlined":null},"text":"November"},{"block_offset":{"end":90,"start":86},"doc_offset":{"end":115,"start":111},"page_num":0,"page_offset":{"end":115,"start":111},"position":{"bbBot":914,"bbLeft":1108,"bbRight":1216,"bbTop":867,"bottom":914,"left":1108,"right":1216,"top":867},"style":{"background_color":null,"bold":null,"font_face":null,"font_size":null,"handwriting":false,"italic":null,"text_color":null,"underlined":null},"text":"Lima"},{"block_offset":{"end":97,"start":92},"doc_offset":{"end":122,"start":117},"page_num":0,"page_offset":{"end":122,"start":117},"position":{"bbBot":963,"bbLeft":1637,"bbRight":1772,"bbTop":922,"bottom":963,"left":1637,"right":1772,"top":922},"style":{"background_color":null,"bold":null,"font_face":null,"font_size":null,"handwriting":false,"italic":null,"text_color":null,"underlined":null},"text":"Oscar"},{"block_offset":{"end":9,"start":0},"doc_offset":{"end":132,"start":123},"page_num":0,"page_offset":{"end":132,"start":123},"position":{"bbBot":1393,"bbLeft":2007,"bbRight":2145,"bbTop":1352,"bottom":1393,"left":2007,"right":2145,"top":1352},"style":{"background_color":null,"bold":null,"font_face":null,"font_size":null,"handwriting":false,"italic":null,"text_color":null,"
underlined":null},"text":"formatted"},{"block_offset":{"end":12,"start":10},"doc_offset":{"end":135,"start":133},"page_num":0,"page_offset":{"end":135,"start":133},"position":{"bbBot":1390,"bbLeft":2154,"bbRight":2187,"bbTop":1352,"bottom":1390,"left":2154,"right":2187,"top":1352},"style":{"background_color":null,"bold":null,"font_face":null,"font_size":null,"handwriting":false,"italic":null,"text_color":null,"underlined":null},"text":"by"},{"block_offset":{"end":21,"start":13},"doc_offset":{"end":144,"start":136},"page_num":0,"page_offset":{"end":144,"start":136},"position":{"bbBot":1390,"bbLeft":2196,"bbRight":2342,"bbTop":1351,"bottom":1390,"left":2196,"right":2342,"top":1351},"style":{"background_color":null,"bold":null,"font_face":null,"font_size":null,"handwriting":false,"italic":null,"text_color":null,"underlined":null},"text":"Markdeep"},{"block_offset":{"end":28,"start":22},"doc_offset":{"end":151,"start":145},"page_num":0,"page_offset":{"end":151,"start":145},"position":{"bbBot":1390,"bbLeft":2354,"bbRight":2448,"bbTop":1351,"bottom":1390,"left":2354,"right":2448,"top":1351},"style":{"background_color":null,"bold":null,"font_face":null,"font_size":null,"handwriting":false,"italic":null,"text_color":null,"underlined":null},"text":"1.18_d"}]

0 commit comments

Comments
 (0)