Skip to content

Commit 5627e75

Browse files
authored
Merge pull request #8 from freud-digital/dev
merging class=ff paragraphs with previous paragraph, adapting tests
2 parents a9f718d + c8c05da commit 5627e75

File tree

6 files changed

+58
-15
lines changed

6 files changed

+58
-15
lines changed

.flake8

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7,3 +7,4 @@ exclude =
77
build
88
dist
99
env
10+
venv

freud_api_crawler/fixtures/make_tei.xslt

Lines changed: 46 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -23,8 +23,51 @@
2323
####################
2424
-->
2525

26+
<xsl:template match="tei:body">
27+
<body>
28+
<div>
29+
<xsl:apply-templates/>
30+
</div>
31+
</body>
32+
</xsl:template>
33+
<xsl:template match="tei:div">
34+
<xsl:apply-templates/>
35+
</xsl:template>
36+
<xsl:template match="tei:div/tei:p[position() = last()]">
37+
<xsl:copy>
38+
<xsl:apply-templates/>
39+
<xsl:choose>
40+
<xsl:when test="parent::tei:div/following-sibling::tei:div[1]/child::tei:p[@class='ff']">
41+
<xsl:copy-of select="parent::tei:div/following-sibling::tei:div[1]/child::tei:pb"/>
42+
<fw type="pageNum">
43+
<xsl:value-of select="parent::tei:div/following-sibling::tei:div[1]/child::tei:p[./tei:span[@class='pagenumber']]/tei:span[@class='pagenumber']"/>
44+
</fw>
45+
<!--<xsl:copy-of select="parent::tei:div/following-sibling::tei:div[1]/child::tei:p[./tei:span[@class='pagenumber']]/tei:span[@class='pagenumber']"/>-->
46+
<xsl:for-each select="parent::tei:div/following-sibling::tei:div[1]/child::tei:p[@class='ff']">
47+
<xsl:apply-templates/>
48+
</xsl:for-each>
49+
</xsl:when>
50+
</xsl:choose>
51+
</xsl:copy>
52+
</xsl:template>
53+
<xsl:template match="tei:pb[following-sibling::tei:p[@class='ff']]">
54+
55+
</xsl:template>
56+
<xsl:template match="tei:p[@class='ff']">
57+
58+
</xsl:template>
59+
<xsl:template match="tei:space">
60+
<xsl:text>&#x00A0;</xsl:text>
61+
</xsl:template>
2662
<xsl:template match="tei:p[./tei:span[@class='pagenumber']]">
27-
<fw type="pageNum"><xsl:value-of select=".//text()"/></fw>
63+
<xsl:choose>
64+
<xsl:when test="following-sibling::tei:p[@class='ff']">
65+
66+
</xsl:when>
67+
<xsl:otherwise>
68+
<fw type="pageNum"><xsl:value-of select=".//text()"/></fw>
69+
</xsl:otherwise>
70+
</xsl:choose>
2871
</xsl:template>
2972
<xsl:template match="tei:p[@class='marginalie_place']">
3073
<p rendition="#marginalie_place"><xsl:apply-templates/></p>
@@ -35,9 +78,9 @@
3578
<xsl:template match="tei:p[@class='footnote footnote-ff']">
3679
<note type="footnote" prev="true"><xsl:apply-templates/></note>
3780
</xsl:template>
38-
<xsl:template match="tei:p[@class='ff']">
81+
<!-- <xsl:template match="tei:p[@class='ff']">
3982
<p prev="true"><xsl:apply-templates/></p>
40-
</xsl:template>
83+
</xsl:template> -->
4184

4285
<!--
4386
####################

freud_api_crawler/freud_api_crawler.py

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -426,11 +426,7 @@ def make_xml(self, save=False, limit=True):
426426
page_json = self.get_page(x['id'])
427427
pp = self.process_page(page_json)
428428
div = ET.fromstring(pp['body'])
429-
pb_el = make_pb(
430-
pp['page_nr'],
431-
f"{FRD_BASE}{pp['faks__payload']}",
432-
pp['faks__id']
433-
)
429+
pb_el = make_pb(pp)
434430
cur_div = div.xpath('//tei:div', namespaces=self.nsmap)[0]
435431
cur_div.insert(0, pb_el)
436432
body.append(div)

freud_api_crawler/string_utils.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,8 @@
44
('\n', ''),
55
('-<br />', '<lb break="no"/>'),
66
('<br />', '<lb />\n'),
7-
('</p>', '</p>\n'),
7+
('</p>', '<lb break="paragraph"/></p>\n'),
8+
('-<lb break="paragraph"/></p>', '<lb break="no"/></p>'),
89
('‚', ','),
910
('ı', 'i')
1011
]

freud_api_crawler/tei_utils.py

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,14 +1,13 @@
11
import lxml.etree as ET
22

33

4-
def make_pb(n, faks_url, faks_id):
4+
def make_pb(json):
55
""" returns a tei:pb
66
"""
77
pb_el = ET.Element("{http://www.tei-c.org/ns/1.0}pb")
8-
pb_el.attrib['n'] = f"{n}"
9-
pb_el.attrib['facs'] = f"{faks_url}"
8+
pb_el.attrib['n'] = f"{json['page_nr']}"
109
pb_el.attrib[
1110
"{http://www.w3.org/XML/1998/namespace}id"
12-
] = f"faks__{faks_id}"
11+
] = f"page__{json['id']}"
1312

1413
return pb_el

tests/test_freud_api_crawler.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -205,10 +205,13 @@ def tearDown(self):
205205
def test_001_make_pg(self):
206206
""" Test make_pb"""
207207
pb_el = tei_utils.make_pb(
208-
1, 'https://whatever.com', "1234sieben"
208+
{
209+
"page_nr": 1,
210+
"id": "xyz"
211+
}
209212
)
210213
pb_str = ET.tostring(pb_el).decode('utf-8')
211214
self.assertEqual(
212215
pb_str,
213-
'<ns0:pb xmlns:ns0="http://www.tei-c.org/ns/1.0" n="1" facs="https://whatever.com" xml:id="faks__1234sieben"/>' # noqa: E501
216+
'<ns0:pb xmlns:ns0="http://www.tei-c.org/ns/1.0" n="1" xml:id="page__xyz"/>' # noqa: E501
214217
)

0 commit comments

Comments
 (0)