-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract-classifications.js
More file actions
79 lines (66 loc) · 2.81 KB
/
Copy pathextract-classifications.js
File metadata and controls
79 lines (66 loc) · 2.81 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
const fs = require('fs');
const path = require('path');
const { parse } = require('node-html-parser');
// Load classification.html, which is expected to sit next to this script,
// as UTF-8 text.
const sourceFile = path.join(__dirname, 'classification.html');
const markup = fs.readFileSync(sourceFile, 'utf-8');

// Parse the markup and select every <p> and <tr> with a single query, so
// category headings and table rows end up in one combined list.
// NOTE(review): the code that consumes this list appears to rely on the
// elements arriving in document order — confirm the parser guarantees that.
const allElements = parse(markup).querySelectorAll('p, tr');

// Extracted { code, name } records, filled in while walking allElements.
const classifications = [];

// Most recent category heading seen during the walk; null until one is found.
let currentCategory = null;
// Walk the selected elements once. A <p> heading updates the running
// category; a <tr> row may contribute one classification record.
for (const node of allElements) {
  switch (node.tagName) {
    case 'P': {
      // A category heading needs at least two <font> elements: the first
      // holds the code in a <strong>, the second holds the name in a <strong>.
      const fontTags = node.querySelectorAll('font');
      if (fontTags.length < 2) {
        break;
      }

      const codeStrong = fontTags[0].querySelector('strong');
      const nameStrong = fontTags[1].querySelector('strong');
      if (!codeStrong || !nameStrong) {
        break;
      }

      const code = codeStrong.text.trim();
      const name = nameStrong.text.trim();
      // Only a heading with both parts non-empty replaces the current category.
      if (code && name) {
        currentCategory = { code, name };
      }
      break;
    }

    case 'TR': {
      // Only rows with exactly three cells are considered; the code is read
      // from the <font> in the second cell and the name from the third.
      const tds = node.querySelectorAll('td');
      if (tds.length !== 3) {
        break;
      }

      const codeFont = tds[1].querySelector('font');
      const nameFont = tds[2].querySelector('font');
      if (!codeFont || !nameFont) {
        break;
      }

      const code = codeFont.text.trim();
      const label = nameFont.text.trim();
      if (!code || !label) {
        break;
      }

      if (label !== 'General') {
        classifications.push({ code, name: label });
      } else if (currentCategory) {
        // A bare "General" row is qualified with the current category name.
        // A "General" row seen before any category heading is skipped.
        classifications.push({ code, name: `General ${currentCategory.name}` });
      }
      break;
    }

    default:
      break;
  }
}
// Write the extracted records next to this script as pretty-printed JSON
// (2-space indent).
const jsonPath = path.join(__dirname, 'classifications.json');
fs.writeFileSync(jsonPath, JSON.stringify(classifications, null, 2));

// Also write them as CSV: a `code,name` header row followed by one row per
// record, with every field wrapped in double quotes and no trailing newline
// after the last row.
const csvPath = path.join(__dirname, 'classifications.csv');

// Quote a single CSV field. A double quote inside a quoted field must be
// written as two double quotes; without this, a value containing `"` would
// terminate the field early and corrupt the row.
const quoteCsvField = (value) => `"${String(value).replace(/"/g, '""')}"`;

const csvRows = classifications.map(
  ({ code, name }) => `${quoteCsvField(code)},${quoteCsvField(name)}`
);
const csvContent = `code,name\n${csvRows.join('\n')}`;
fs.writeFileSync(csvPath, csvContent);
// Report how many records were extracted and where the two output files went.
console.log(`✓ Extracted ${classifications.length} classifications`);
for (const outputPath of [jsonPath, csvPath]) {
  console.log(` Output: ${outputPath}`);
}

// Preview up to the first five records as a quick sanity check.
console.log('\nFirst 5 entries:');
for (const { code, name } of classifications.slice(0, 5)) {
  console.log(` ${code} → ${name}`);
}