Skip to content

Commit dff0cfa

Browse files
committed
Merge dev into main
2 parents 1db2a05 + 845e104 commit dff0cfa

32 files changed

+600
-256
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
Powerful Python util methods and classes that simplify common apis and tasks.
44

5-
![Current Release](https://img.shields.io/badge/release-v2.56.0-blue)
5+
![Current Release](https://img.shields.io/badge/release-v2.56.6-blue)
66
[![codecov](https://codecov.io/gh/owasp-sbot/OSBot-Utils/graph/badge.svg?token=GNVW0COX1N)](https://codecov.io/gh/owasp-sbot/OSBot-Utils)
77

88

osbot_utils/helpers/html/Dict__To__Css.py renamed to osbot_utils/helpers/html/CSS_Dict__To__Css.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from osbot_utils.base_classes.Kwargs_To_Self import Kwargs_To_Self
22

3-
4-
class Dict__To__Css(Kwargs_To_Self):
3+
class CSS_Dict__To__Css(Kwargs_To_Self):
54
css: dict
65

76

osbot_utils/helpers/html/Dict__To__Html.py renamed to osbot_utils/helpers/html/Html_Dict__To__Html.py

Lines changed: 19 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1+
from osbot_utils.helpers.html.Html__To__Html_Dict import STRING__SCHEMA_TEXT, STRING__SCHEMA_NODES
2+
13
HTML_SELF_CLOSING_TAGS = {'area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr'}
24
HTML_DEFAULT_DOCTYPE_VALUE = "<!DOCTYPE html>\n"
35

4-
class Dict__To__Html:
6+
class Html_Dict__To__Html:
57
def __init__(self, root, include_doctype=True, doctype=HTML_DEFAULT_DOCTYPE_VALUE):
68
self.self_closing_tags = HTML_SELF_CLOSING_TAGS # Define a list of self-closing tags
79
self.root = root
@@ -35,41 +37,41 @@ def convert_attrs(self, attrs):
3537
def convert_element(self, element, indent_level):
3638
"""Recursively converts a dictionary to an HTML string with indentation."""
3739
# Check if this is a text node
38-
if element.get("type") == "text":
40+
if element.get("type") == STRING__SCHEMA_TEXT:
3941
return element.get("data", "") # Return text content directly for text nodes
4042

41-
tag = element.get("tag")
42-
attrs = element.get("attrs", {})
43-
children = element.get("children", [])
43+
tag = element.get("tag")
44+
attrs = element.get("attrs", {})
45+
nodes = element.get(STRING__SCHEMA_NODES, [])
4446

4547
attrs_str = self.convert_attrs(attrs) # Convert attributes dictionary to a string
4648
indent = " " * indent_level # Indentation for the current level, assuming 4 spaces per indent level
4749

4850
# Handle self-closing tags
49-
if tag in self.self_closing_tags and not children: # Check if the tag is self-closing and has no children
51+
if tag in self.self_closing_tags and not nodes: # Check if the tag is self-closing and has no nodes
5052
return f"{indent}<{tag}{attrs_str} />\n"
5153

5254
# Start building the HTML
5355
html = f"{indent}<{tag}{attrs_str}>" # Opening tag with indentation
5456

55-
# Separate children into text nodes and element nodes
56-
text_nodes = [child for child in children if child.get("type") == "text"]
57-
element_nodes = [child for child in children if child.get("type") != "text"]
57+
# Separate nodes into text nodes and element nodes
58+
text_nodes = [node for node in nodes if node.get("type") == STRING__SCHEMA_TEXT]
59+
element_nodes = [node for node in nodes if node.get("type") != STRING__SCHEMA_TEXT]
5860

5961
# If there are only element nodes, add a newline after the opening tag
6062
if element_nodes and not text_nodes:
6163
html += "\n"
6264

63-
# Process children, maintaining the original order but with proper formatting
64-
if children:
65+
# Process nodes, maintaining the original order but with proper formatting
66+
if nodes:
6567
# Track if we're currently in a text section or element section
6668
# This helps us add newlines only between elements, not text
6769
previous_was_element = False
6870

69-
for child in children:
70-
if child.get("type") == "text":
71+
for node in nodes:
72+
if node.get("type") == STRING__SCHEMA_TEXT:
7173
# Text node - directly append content
72-
html += child.get("data", "")
74+
html += node.get("data", "")
7375
previous_was_element = False
7476
else:
7577
# Element node - format with proper indentation
@@ -78,14 +80,14 @@ def convert_element(self, element, indent_level):
7880
if not html.endswith("\n"):
7981
html += "\n"
8082

81-
html += self.convert_element(child, indent_level + 1)
83+
html += self.convert_element(node, indent_level + 1)
8284
previous_was_element = True
8385

8486
# Handle closing tag based on content
8587
if element_nodes and not text_nodes:
86-
# If only element children, add indented closing tag
88+
# If only element nodes, add indented closing tag
8789
html += f"{indent}</{tag}>\n"
88-
elif children: # Any type of children
90+
elif nodes: # Any type of nodes
8991
# If mixed content or only text, add closing tag without indentation
9092
html += f"</{tag}>\n"
9193
else:
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
from typing import Dict, Union, Any
2+
from osbot_utils.helpers.html.Html__To__Html_Dict import STRING__SCHEMA_TEXT, STRING__SCHEMA_NODES
3+
from osbot_utils.helpers.html.schemas.Schema__Html_Document import Schema__Html_Document
4+
from osbot_utils.helpers.html.schemas.Schema__Html_Node import Schema__Html_Node
5+
from osbot_utils.helpers.html.schemas.Schema__Html_Node__Data import Schema__Html_Node__Data
6+
from osbot_utils.helpers.html.schemas.Schema__Html_Node__Data__Type import Schema__Html_Node__Data__Type
7+
from osbot_utils.type_safe.Type_Safe import Type_Safe
8+
9+
10+
class Html_Dict__To__Html_Document(Type_Safe):
    """Builds a Schema__Html_Document from a nested HTML dictionary.

    The input dictionary is read with the keys 'type', 'data', 'tag', 'attrs'
    and STRING__SCHEMA_NODES (the list of nested node dictionaries).
    """
    html__dict    : dict                  = None                                # source dictionary to convert
    html__document: Schema__Html_Document = None                                # populated by convert()

    def convert(self):
        """Parse self.html__dict, store the result in self.html__document and return it."""
        self.html__document = self.parse_html_dict(self.html__dict)
        return self.html__document

    def parse_html_dict(self, target: Dict[str, Any]) -> Schema__Html_Document:
        """Wrap the parsed root node of `target` in a Schema__Html_Document.

        Raises:
            ValueError: when `target` is falsy or is not a dict.
        """
        if not (target and isinstance(target, dict)):                           # reject None, {} and non-dict values
            raise ValueError("Invalid HTML dictionary structure")

        return Schema__Html_Document(root_node=self.parse_node(target))

    def parse_node(self, target: Dict[str, Any]) -> Union[Schema__Html_Node, Schema__Html_Node__Data]:
        """Recursively convert one node dictionary into its schema object.

        A dictionary whose 'type' equals STRING__SCHEMA_TEXT becomes a
        Schema__Html_Node__Data; anything else becomes a Schema__Html_Node
        whose nested nodes are converted in their original order.
        """
        if target.get('type') != STRING__SCHEMA_TEXT:                           # element node
            child_nodes = [self.parse_node(child)
                           for child in target.get(STRING__SCHEMA_NODES, [])]
            return Schema__Html_Node(attrs = target.get('attrs', {}),
                                     nodes = child_nodes             ,
                                     tag   = target.get('tag', '')   )

        return Schema__Html_Node__Data(data = target.get('data', '')            ,   # text node
                                       type = Schema__Html_Node__Data__Type.TEXT)

osbot_utils/helpers/html/Dict__To__Tags.py renamed to osbot_utils/helpers/html/Html_Dict__To__Html_Tags.py

Lines changed: 37 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,14 @@
1-
from osbot_utils.helpers.html.Dict__To__Html import HTML_SELF_CLOSING_TAGS
2-
from osbot_utils.helpers.html.Tag__Base import Tag__Base
3-
from osbot_utils.helpers.html.Tag__Body import Tag__Body
4-
from osbot_utils.helpers.html.Tag__Head import Tag__Head
5-
from osbot_utils.helpers.html.Tag__Html import Tag__Html
6-
from osbot_utils.helpers.html.Tag__Link import Tag__Link
7-
from osbot_utils.helpers.html.Tag__Text import Tag__Text
1+
from osbot_utils.helpers.html.Html_Dict__To__Html import HTML_SELF_CLOSING_TAGS
2+
from osbot_utils.helpers.html.Html__To__Html_Dict import STRING__SCHEMA_TEXT, STRING__SCHEMA_NODES
3+
from osbot_utils.helpers.html.Tag__Base import Tag__Base
4+
from osbot_utils.helpers.html.Tag__Body import Tag__Body
5+
from osbot_utils.helpers.html.Tag__Head import Tag__Head
6+
from osbot_utils.helpers.html.Tag__Html import Tag__Html
7+
from osbot_utils.helpers.html.Tag__Link import Tag__Link
8+
from osbot_utils.helpers.html.Tag__Text import Tag__Text
89

910

10-
class Dict__To__Tags:
11+
class Html_Dict__To__Html_Tags:
1112

1213
def __init__(self, root):
1314
self.root = root
@@ -30,25 +31,25 @@ def convert_element(self, element):
3031
return self.convert_to__tag(Tag__Base, element, 0) # Default indent 0
3132

3233
def collect_inner_text(self, element):
33-
"""Extract all text from an element's text node children."""
34+
"""Extract all text from an element's text nodes."""
3435
inner_text = ""
35-
for child in element.get("children", []):
36-
if child.get("type") == "text":
37-
inner_text += child.get("data", "")
36+
for node in element.get(STRING__SCHEMA_NODES, []):
37+
if node.get("type") == STRING__SCHEMA_TEXT:
38+
inner_text += node.get("data", "")
3839
return inner_text
3940

4041
def convert_to__tag(self, target_tag, element, indent):
41-
if element.get("type") == "text":
42+
if element.get("type") == STRING__SCHEMA_TEXT:
4243
# Handle text nodes directly
4344
return Tag__Text(element.get("data", ""))
4445

4546
tag_name = element.get("tag")
4647
attrs = element.get("attrs", {})
47-
children = element.get("children", [])
48+
nodes = element.get(STRING__SCHEMA_NODES, [])
4849
end_tag = tag_name not in HTML_SELF_CLOSING_TAGS
4950
tag_indent = indent + 1
5051

51-
# Collect inner text from all text node children
52+
# Collect inner text from all text nodes
5253
inner_html = self.collect_inner_text(element)
5354

5455
tag_kwargs = dict(
@@ -61,60 +62,60 @@ def convert_to__tag(self, target_tag, element, indent):
6162

6263
tag = target_tag(**tag_kwargs)
6364

64-
# Process only element nodes as children (text is already handled via inner_html)
65-
for child in children:
66-
if child.get("type") != "text": # Skip text nodes, they're in inner_html
67-
child_tag = self.convert_to__tag(Tag__Base, child, tag_indent)
65+
# Process only element nodes as child tags (text is already handled via inner_html)
66+
for node in nodes:
67+
if node.get("type") != STRING__SCHEMA_TEXT: # Skip text nodes, they're in inner_html
68+
child_tag = self.convert_to__tag(Tag__Base, node, tag_indent)
6869
tag.elements.append(child_tag)
6970

7071
return tag
7172

7273
def convert_to__tag__head(self, element, indent):
73-
attrs = element.get("attrs", {})
74-
children = element.get("children", [])
74+
attrs = element.get("attrs", {})
75+
nodes = element.get(STRING__SCHEMA_NODES, [])
7576

7677
head_indent = indent + 1
7778
tag_head = Tag__Head(indent=head_indent, **attrs)
7879

79-
for child in children:
80-
tag_name = child.get("tag")
80+
for node in nodes:
81+
tag_name = node.get("tag")
8182

8283
if tag_name == 'title':
83-
# Extract title text from text node children
84-
tag_head.title = self.collect_inner_text(child)
84+
# Extract title text from the title's text nodes
85+
tag_head.title = self.collect_inner_text(node)
8586
elif tag_name == 'link':
86-
tag_head.links.append(self.convert_to__tag__link(child))
87+
tag_head.links.append(self.convert_to__tag__link(node))
8788
elif tag_name == 'meta':
88-
tag_head.elements.append(self.convert_to__tag(Tag__Base, child, head_indent))
89+
tag_head.elements.append(self.convert_to__tag(Tag__Base, node, head_indent))
8990
elif tag_name == 'style':
9091
# For style tags, collect the CSS content from text nodes
91-
style_element = self.convert_to__tag(Tag__Base, child, head_indent)
92+
style_element = self.convert_to__tag(Tag__Base, node, head_indent)
9293
tag_head.elements.append(style_element)
9394
else:
9495
# Handle any other head elements
95-
tag_head.elements.append(self.convert_to__tag(Tag__Base, child, head_indent))
96+
tag_head.elements.append(self.convert_to__tag(Tag__Base, node, head_indent))
9697

9798
return tag_head
9899

99100
def convert_to__tag__html(self, element):
100-
attrs = element.get("attrs", {})
101-
children = element.get("children", [])
102-
lang = attrs.get("lang")
101+
attrs = element.get("attrs", {})
102+
nodes = element.get(STRING__SCHEMA_NODES, [])
103+
lang = attrs.get("lang")
103104

104105
tag_html = Tag__Html(attributes=attrs, lang=lang, doc_type=False)
105106

106107
# Initialize head and body if not found
107108
head_found = False
108109
body_found = False
109110

110-
for child in children:
111-
tag_name = child.get("tag")
111+
for node in nodes:
112+
tag_name = node.get("tag")
112113

113114
if tag_name == 'head':
114-
tag_html.head = self.convert_to__tag__head(child, tag_html.indent)
115+
tag_html.head = self.convert_to__tag__head(node, tag_html.indent)
115116
head_found = True
116117
elif tag_name == 'body':
117-
tag_html.body = self.convert_to__tag(Tag__Body, child, tag_html.indent)
118+
tag_html.body = self.convert_to__tag(Tag__Body, node, tag_html.indent)
118119
body_found = True
119120
else:
120121
# Log unexpected child elements of html

osbot_utils/helpers/html/Html__To__Dict.py renamed to osbot_utils/helpers/html/Html__To__Html_Dict.py

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
from html.parser import HTMLParser
22

33
HTML_SELF_CLOSING_TAGS = {'area', 'base', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input', 'link', 'meta', 'param', 'source', 'track', 'wbr'}
4-
STRING__DATA_TEXT = 'TEXT:'
4+
STRING__SCHEMA_TEXT = 'TEXT'
5+
STRING__SCHEMA_NODES = 'nodes'
6+
STRING__DATA_TEXT = f'{STRING__SCHEMA_TEXT}:'
57

6-
class Html__To__Dict(HTMLParser):
8+
9+
class Html__To__Html_Dict(HTMLParser):
710
def __init__(self, html):
811
super().__init__()
912
self.root = None # No root initially
@@ -18,15 +21,15 @@ def convert(self):
1821
return self.root
1922

2023
def handle_starttag(self, tag, attrs):
21-
new_tag = {"tag": tag, "attrs": dict(attrs), "children": []}
24+
new_tag = {"tag": tag, "attrs": dict(attrs), STRING__SCHEMA_NODES: []}
2225

2326
if self.current is None:
2427
# When the first tag is encountered, it becomes the root
2528
self.root = new_tag
2629
self.current = new_tag
2730
else:
2831
# Otherwise, append the new tag as a child of the current tag
29-
self.current["children"].append(new_tag)
32+
self.current[STRING__SCHEMA_NODES].append(new_tag)
3033

3134
# If this tag is not a void element, push it onto the stack
3235
if tag.lower() not in self.void_elements:
@@ -48,33 +51,33 @@ def handle_endtag(self, tag):
4851
def handle_data(self, data):
4952
if data.strip(): # Ignore whitespace
5053
# Create a text node as a child
51-
text_node = {"type": "text", "data": data}
52-
self.current["children"].append(text_node)
54+
text_node = {"type": STRING__SCHEMA_TEXT, "data": data}
55+
self.current[STRING__SCHEMA_NODES].append(text_node)
5356

5457
def print__generate_lines(self, node, indent="", last=True, is_root=True):
5558
lines = []
5659

5760
prefix = "" if is_root else ("└── " if last else "├── ")
5861

59-
if node.get("type") == "text":
62+
if node.get("type") == STRING__SCHEMA_TEXT:
6063
text_data = node.get('data')
6164
if self.strip_text_data:
6265
text_data = text_data.strip()
6366
lines.append(f"{indent}{prefix}{STRING__DATA_TEXT} {text_data}")
6467
else:
6568
tag = node.get("tag")
6669
attrs = node.get("attrs", {})
67-
children = node.get("children", [])
70+
nodes = node.get(STRING__SCHEMA_NODES, [])
6871
attrs_str = ' '.join(f'{key}="{value}"' for key, value in attrs.items())
6972
attrs_str = f' ({attrs_str})' if attrs_str else ''
7073

7174
lines.append(f"{indent}{prefix}{tag}{attrs_str}")
7275

7376
child_indent = indent + (" " if last else "│ ")
7477

75-
for i, child in enumerate(children):
76-
is_last = i == len(children) - 1
77-
child_lines = self.print__generate_lines(child, indent=child_indent, last=is_last, is_root=False)
78+
for i, node in enumerate(nodes):
79+
is_last = i == len(nodes) - 1
80+
child_lines = self.print__generate_lines(node, indent=child_indent, last=is_last, is_root=False)
7881
lines.extend(child_lines if isinstance(child_lines, list) else [child_lines])
7982

8083
return lines if is_root else "\n".join(lines)
@@ -95,7 +98,7 @@ def print__lines(self, lines):
9598

9699
def html_to_dict(html_code: str) -> dict:
97100
try:
98-
html_to_dict = Html__To__Dict(html_code)
101+
html_to_dict = Html__To__Html_Dict(html_code)
99102
html_dict = html_to_dict.convert()
100103
return html_dict
101104
except: # todo: see if there is a better Exception to capture
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
from osbot_utils.helpers.html.Html_Dict__To__Html_Document import Html_Dict__To__Html_Document
2+
from osbot_utils.helpers.html.Html__To__Html_Dict import Html__To__Html_Dict
3+
from osbot_utils.helpers.html.schemas.Schema__Html_Document import Schema__Html_Document
4+
from osbot_utils.type_safe.Type_Safe import Type_Safe
5+
6+
7+
class Html__To__Html_Document(Type_Safe):
    """Converts an HTML string into a Schema__Html_Document in two steps:
    HTML -> dict (Html__To__Html_Dict), then dict -> document
    (Html_Dict__To__Html_Document)."""
    html          : str                                                         # raw HTML to convert
    html__dict    : dict                                                        # NOTE(review): declared but never assigned by convert()
    html__document: Schema__Html_Document                                       # set by convert() on success

    def convert(self):
        """Return the parsed document and store it in self.html__document.

        Returns None when self.html is empty, when the intermediate
        dictionary is falsy, or when the resulting document is falsy.
        """
        if not self.html:                                                       # nothing to parse
            return None

        parsed_dict = Html__To__Html_Dict(self.html).convert()
        if not parsed_dict:                                                     # parser produced no root
            return None

        with Html_Dict__To__Html_Document(html__dict=parsed_dict).convert() as html__document:
            if not html__document:
                return None
            self.html__document = html__document
            return html__document

0 commit comments

Comments
 (0)