Skip to content

Commit 4298c7a

Browse files
committed
measure hole parsing + determinization
1 parent d8aa39d commit 4298c7a

File tree

5 files changed

+155
-112
lines changed

5 files changed

+155
-112
lines changed

src/commonMain/kotlin/ai/hypergraph/kaliningraph/automata/FSM.kt

Lines changed: 114 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,11 @@ import ai.hypergraph.kaliningraph.automata.GRE.CAT
44
import ai.hypergraph.kaliningraph.automata.GRE.CUP
55
import ai.hypergraph.kaliningraph.automata.GRE.EPS
66
import ai.hypergraph.kaliningraph.automata.GRE.SET
7+
import ai.hypergraph.kaliningraph.sampling.LFSR
8+
import ai.hypergraph.kaliningraph.sampling.bigLFSRSequence
9+
import ai.hypergraph.kaliningraph.sampling.longLFSRSequence
710
import ai.hypergraph.kaliningraph.types.filter
11+
import com.ionspin.kotlin.bignum.integer.BigInteger
812

913
// Alternate to FSA; bypasses graph subtyping, basically just record types
1014
class DFSM(
@@ -223,6 +227,66 @@ open class NFSM(
223227
}
224228
}
225229

230+
/**
 * Determinizes this NFSM with the subset construction over the symbols `0 until width`.
 *
 * Every reachable set of NFSM states becomes one DFSM state. States are named "q0", "q1", ...
 * in breadth-first discovery order, and the start subset `{q_alpha}` is always "q0". A subset
 * is accepting when it contains at least one state from [F]. Empty successor sets are dropped,
 * so the resulting transition map is partial: no explicit sink state is created.
 *
 * NOTE(review): transitions are followed exactly as listed in `delta` and no ε-closure is
 * computed, so this presumably relies on `delta` being ε-free — confirm against `toNFSM()`.
 *
 * @param width alphabet size; only symbols in `0 until width` are explored.
 * @return the determinized machine, whose start state is "q0".
 */
fun NFSM.toDFSM(width: Int): DFSM {
  // Pre-index NFA transitions: from -> (symbol -> {to, ...}).
  val tmap = mutableMapOf<String, MutableMap<Int, MutableSet<String>>>()
  for ((from, a, to) in delta) {
    tmap.getOrPut(from) { mutableMapOf() }.getOrPut(a) { mutableSetOf() }.add(to)
  }

  // Union of the `a`-successors of every state in `states` (empty when none has one).
  fun succ(states: Set<String>, a: Int): Set<String> =
    states.flatMapTo(mutableSetOf()) { s -> tmap[s]?.get(a).orEmpty() }

  val start = setOf(q_alpha)

  // Insertion-ordered so that `size` at discovery time yields the next fresh state index.
  val subset2name = LinkedHashMap<Set<String>, String>()
  val queue = ArrayDeque<Set<String>>()
  val deltaMap = mutableMapOf<String, MutableMap<Int, String>>()
  val finals = mutableSetOf<String>()

  subset2name[start] = "q0"
  queue.add(start)

  while (queue.isNotEmpty()) {
    val subset = queue.removeFirst()
    // Every queued subset was registered in subset2name before being enqueued.
    val name = subset2name.getValue(subset)
    if (subset.any { it in F }) finals.add(name)

    val row = deltaMap.getOrPut(name) { mutableMapOf() }
    for (a in 0 until width) {
      val target = succ(subset, a)
      if (target.isEmpty()) continue // no sink
      row[a] = subset2name.getOrPut(target) {
        // First time this subset is seen: schedule it and hand out the next name.
        queue.add(target)
        "q${subset2name.size}"
      }
    }
  }

  return DFSM(subset2name.values.toSet(), deltaMap, "q0", finals, width)
}
287+
288+
/**
 * Converts this expression to a DFSM by building its NFSM and determinizing it.
 *
 * Only the size of [terms] is used: it is passed on as the alphabet width.
 */
fun GRE.toDFSM(terms: List<String>): DFSM {
  val alphabetSize = terms.size
  return toNFSM().toDFSM(alphabetSize)
}
289+
226290
fun GRE.toNFSM(): NFSM {
227291
var stateCounter = 0
228292
fun freshState(): String = "q${stateCounter++}"
@@ -306,65 +370,6 @@ fun GRE.toNFSM(): NFSM {
306370
return buildNFSM(this)
307371
}
308372

309-
/**
 * Builds a total DFSM for this expression by exploring the results of `dv(σ)` for every
 * symbol `σ` in `0 until width`.
 *
 * Expressions are identified by their `hash()`: two expressions with the same hash share one
 * state. States are named "q0", "q1", ... in discovery order, and a state is accepting when
 * its expression is `nullable`. A `null` result from `dv` is routed to a dedicated
 * non-accepting "sink" state that loops to itself on every symbol.
 *
 * If `enumerate()` yields nothing, the result is a single-state machine consisting only of
 * that sink.
 */
fun GRE.toDFSM(): DFSM {
  val width = this.width
  val alphabet = 0 until width
  val sink = "sink"

  // Handle empty language
  if (enumerate().none()) {
    return DFSM(
      Q = setOf(sink),
      deltaMap = mapOf(sink to alphabet.associateWith { sink }),
      q_alpha = sink,
      F = emptySet(),
      width = width
    )
  }

  val languageToState = mutableMapOf<String, String>()
  val queue = ArrayDeque<GRE>()
  val deltaMap = mutableMapOf<String, MutableMap<Int, String>>()
  val F = mutableSetOf<String>()

  // Initial state
  val q0 = "q0"
  languageToState[hash()] = q0
  queue.add(this)

  while (queue.isNotEmpty()) {
    val currentGRE = queue.removeFirst()
    val currentLang = currentGRE.hash()
    // Every queued expression had its hash registered before it was enqueued.
    val currentState = languageToState[currentLang]!!

    // Set accepting state
    if (currentGRE.nullable) F.add(currentState)

    // Compute transitions
    for (σ in alphabet) {
      val derivative = currentGRE.dv(σ)
      val targetState = if (derivative != null) {
        val derivLang = derivative.hash()
        languageToState.getOrPut(derivLang) {
          // Size before insertion is the next unused state index.
          val newState = "q${languageToState.size}"
          queue.add(derivative)
          newState
        }
      } else {
        sink
      }
      deltaMap.getOrPut(currentState) { mutableMapOf() }[σ] = targetState
    }
  }

  // Configure sink state
  deltaMap[sink] = alphabet.associateWith { sink }.toMutableMap()
  val Q = languageToState.values.toSet() + sink

  return DFSM(Q, deltaMap, q0, F, width)
}
367-
368373
fun DFSM.printAdjMatrixPowers() {
369374
// Build adjacency list from deltaMap
370375
val adj = Q.associateWith { q -> deltaMap[q]?.values?.toSet() ?: emptySet() }
@@ -456,4 +461,54 @@ fun DFSM.printAdjMatrixPowers() {
456461
current = multiply(current, A)
457462
k++
458463
}
464+
}
465+
466+
/**
 * Lazily yields accepted words of this DFSM, one per rank produced by `longLFSRSequence(total)`,
 * where `total` is the number of accepted words reachable from `q_alpha`.
 *
 * Words are ranked per state as: the empty continuation first (if the state is in [F]), then
 * the continuations through each outgoing symbol in ascending symbol order. A rank is decoded
 * by walking that ordering from `q_alpha`; each symbol index is mapped through [tmLst] and the
 * resulting tokens are joined with single spaces.
 *
 * NOTE(review): whether each word appears exactly once depends on `longLFSRSequence` visiting
 * every rank in `0 until total` once — confirm against its implementation.
 *
 * All work, including the checks below, happens on first iteration of the returned sequence.
 *
 * @param tmLst lookup table from symbol index to its printed token.
 * @throws IllegalArgumentException if no word is accepted.
 * @throws IllegalStateException if the transition graph reachable from `q_alpha` has a cycle,
 *   or if the number of accepted words does not fit in a `Long`.
 */
fun DFSM.sampleUniformly(tmLst: List<String>): Sequence<String> = sequence {
  // Memoized number of accepted words from each state.
  val memo = HashMap<String, Long>()
  // States on the current recursion path; re-entering one means the graph has a cycle.
  val inProgress = mutableSetOf<String>()

  fun countFrom(q: String): Long {
    memo[q]?.let { return it }
    check(inProgress.add(q)) { "Cycle detected at state $q; graph must be acyclic for counts to be finite." }
    var sum = if (q in F) 1L else 0L // +1 for epsilon at finals
    for ((_, next) in deltaMap[q].orEmpty()) {
      sum += countFrom(next)
      // Both operands are non-negative, so a wrapped addition always shows up as a negative sum.
      check(sum >= 0L) { "Number of accepted words overflows Long; cannot rank words with 64-bit indices." }
    }
    inProgress.remove(q)
    memo[q] = sum
    return sum
  }

  val total = countFrom(q_alpha)
  require(total > 0L) { "Language is empty; no words to sample." }

  // Decode a rank r ∈ [0, total) into a word (tokens from tmLst joined by spaces).
  fun decode(r0: Long): String {
    var r = r0
    var q = q_alpha
    val out = mutableListOf<Int>()

    while (true) {
      // If current state is final, epsilon contributes the first block of mass.
      if (q in F) {
        if (r == 0L) return out.joinToString(" ") { tmLst[it] }
        r -= 1L
      }

      // Walk one symbol along the unique branch containing r.
      val row = deltaMap[q].orEmpty()
      var stepped = false
      for (a in row.keys.sorted()) {
        val nxt = row.getValue(a)
        val cnt = memo[nxt] ?: countFrom(nxt)
        if (r < cnt) {
          out += a
          q = nxt
          stepped = true
          break
        }
        r -= cnt // skip the whole block of words that start with `a`
      }
      check(stepped) { "Rank out of range while decoding (graph must be acyclic and counts finite)." }
    }
  }

  for (r in longLFSRSequence(total)) yield(decode(r))
}

src/commonMain/kotlin/ai/hypergraph/kaliningraph/parsing/CFG.kt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ val CFG.symLst by cache { (symbols + "ε").toList() }
6060
val CFG.symMap by cache { symLst.mapIndexed { i, s -> s to i }.toMap() }
6161

6262
val CFG.tmLst: List<Σᐩ> by cache { terminals.toList() }
63+
/** A [TermDict] built from this grammar's `terminals`, provided through the `cache` delegate. */
val CFG.tmDict: TermDict by cache { TermDict(terminals) }
6364
val CFG.tmMap: Map<Σᐩ, Int> by cache { tmLst.mapIndexed { i, s -> s to i }.toMap() }
6465
val CFG.tmToVidx: List<List<Int>> by cache { List(tmLst.size) { bimap.TDEPS[tmLst[it]]!!.map { bindex[it] } } }
6566
val CFG.terminalLists: List<Set<Σᐩ>> by cache { nonterminals.map { bimap.UNITS[it] ?: emptySet() } }

src/commonMain/kotlin/ai/hypergraph/kaliningraph/sampling/Samplers.kt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,14 @@ fun bigLFSRSequence(maxVal: BigInteger): Sequence<BigInteger> =
282282
BigLFSR(makeBigIntFromTaps(Polynomials.xlinz[maxVal.bitLength()]!!), makeRandBigInt(maxVal.bitLength()))
283283
.sequence().filter { it < maxVal }
284284

285+
/**
 * Produces `Long` values in `0 until maxVal` drawn from an [LFSR], or nothing when
 * [maxVal] is not positive.
 *
 * The register degree is the smallest `d` with `2^d >= maxVal + 1`. Register outputs greater
 * than [maxVal] are discarded and the rest are shifted down by one.
 *
 * NOTE(review): the shift by one presumably reflects that `LFSR(d)` emits only non-zero
 * values — confirm against `LFSR`.
 */
fun longLFSRSequence(maxVal: Long): Sequence<Long> {
  if (maxVal <= 0L) return emptySequence()

  val bound = (maxVal + 1L).toULong()
  var degree = 0
  var powerOfTwo = 1UL
  while (powerOfTwo < bound) {
    powerOfTwo = powerOfTwo shl 1
    degree++
  }

  return LFSR(degree)
    .map { it.toLong() }
    .filter { it <= maxVal }
    .map { it - 1L } // 0..maxVal-1
}
292+
285293
fun makeBigIntFromTaps(taps: List<Int>): BigInteger =
286294
taps.map {
287295
BigInteger.parseString(Array(it + 1) { if (it == 0) '1' else '0' }.joinToString(""), 2)

src/jvmTest/kotlin/ai/hypergraph/kaliningraph/repair/KotlinTypeChecker.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ object KotlinTypeChecker {
1313
}
1414

1515
fun typeChecks(src: String): Boolean = try {
16-
(engine as Compilable).compile(src) // compile only
16+
(engine as Compilable).compile(src.replace("Bool", "Boolean"))
1717
true
1818
} catch (e: ScriptException) {
1919
System.err.println(e.message)

src/jvmTest/kotlin/ai/hypergraph/kaliningraph/repair/ProbabilisticLBH.kt

Lines changed: 31 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -554,38 +554,58 @@ class ProbabilisticLBH {
554554
@Test
555555
fun testMiniKTAPI() {
556556
val cfg = miniktcfgapi
557+
val timer = TimeSource.Monotonic.markNow()
557558

558559
// val str = "fun f1 ( ) : Int = 1 ; f1 ( )"
559560
// val str = "fun f1 ( x : Int , y : Int ) : Int = x + y ; f1 ( _ _ _ )"
560-
val str = "fun f0 ( p1 : Float , p2 : Float ) : Int = ( if ( p1 == p2 ) { 1 } else { 1 } ) + 1"
561+
val str = "fun f0 ( p1 : Float , p2 : Float ) : Bool = ( if ( p1 == p2 ) { 1 } else { 1 } ) + 1"
561562

562-
println(str.matches(cfg))
563-
println(KotlinTypeChecker.typeChecks(str))
563+
println("CFG recognizes: " + str.matches(cfg))
564+
println("Kotlin recognizes: " + KotlinTypeChecker.typeChecks(str))
564565

565-
val t = initiateSerialRepair(str.tokenizeByWhitespace(), cfg).take(10).toList()
566+
// val t = initiateSerialRepair(str.tokenizeByWhitespace(), cfg).take(10).toList()
567+
val t = repairWithSparseGRE(str.tokenizeByWhitespace(), cfg)!!
568+
.also {
569+
println("GRE obtained in: ${timer.elapsedNow()}")
570+
println("Total words: ${it.toDFSM(cfg.tmLst).countWords()}")
571+
}
572+
.toDFA(cfg.tmLst).apply {
573+
println("Pre-minimization: ${states.size} states")
574+
minimize()
575+
println("DFA minimization in ${timer.elapsedNow()} with ${states.size} states")
576+
}
577+
.decodeDFA(cfg.tmDict).take(1000)
578+
.also { println("Found ${it.size} words empirically") }
566579
assertTrue(t.isNotEmpty())
567-
t.forEach {
580+
t.forEachIndexed { i, it ->
568581
assertTrue(KotlinTypeChecker.typeChecks(it), "Failed type check! $it")
569-
println("" + levenshteinAlign(str, it).paintANSIColors())
582+
if (i < 10) println("" + levenshteinAlign(str, it).paintANSIColors())
570583
}
571584

585+
println("Repair finished in ${timer.elapsedNow()}")
586+
572587
benchmarkMiniKt()
573588
}
574589

575590
fun benchmarkMiniKt() {
576591
val cfg = miniktcfgapi
577-
val tempLen = 25
592+
val tempLen = 20
578593
val timer = TimeSource.Monotonic.markNow()
579594
var avgDelay = 0L
580595
var initDelay = 0L
581596
var avgDelayTimer = TimeSource.Monotonic.markNow()
582597
val samples = 1000
583-
cfg.sampleSeq(List(tempLen) {"_"}).take(1000).forEachIndexed { i, pp -> /*println(pp);*/
598+
val pt = cfg.startGRE(List(tempLen) {"_"})!!
599+
println("Parsed (_)^$tempLen in: ${timer.elapsedNow()}")
600+
val dfsm = pt.toDFSM(cfg.tmLst)
601+
println("|L_∩|: ${dfsm.countWords()} (in ${timer.elapsedNow()})")
602+
dfsm.sampleUniformly(cfg.tmLst).take(1000).onEachIndexed { i, pp ->
584603
if (i == 0) initDelay = timer.elapsedNow().inWholeMilliseconds
585604
avgDelay += avgDelayTimer.elapsedNow().inWholeMilliseconds
586605
avgDelayTimer = TimeSource.Monotonic.markNow()
587-
}
588-
println("Sampled length-$tempLen template from (${cfg.nonterminals.size}, ${cfg.tripleIntProds.size})-CFG in ${initDelay}ms (TTFS), ${avgDelay / samples.toDouble()} (μDELAY)")
606+
if (i < 10) println(pp)
607+
}.take(1000).toList().also { println("Found ${it.size} words empirically") }
608+
println("Sampled length-$tempLen template from (${cfg.nonterminals.size}, ${cfg.tripleIntProds.size})-CFG in ${initDelay}ms (TTFS), ${avgDelay / samples.toDouble()}ms (μDELAY)")
589609
}
590610

591611
/*
@@ -607,45 +627,4 @@ class ProbabilisticLBH {
607627
}
608628
println("Precision: ${precision / total.toDouble()}")
609629
}
610-
}
611-
612-
// NAME . NAME ( STRING , class = STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
613-
// NAME . NAME ( STRING , class ** STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
614-
// NAME . NAME ( STRING , class = STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
615-
// NAME . NAME ( STRING , NAME = STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
616-
// NAME . NAME ( STRING , STRING = STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
617-
// NAME . NAME ( STRING , ) = STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
618-
// NAME . NAME ( STRING , NUMBER = STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
619-
// NAME . NAME ( STRING , class + STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
620-
// NAME . NAME ( STRING , ... = STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
621-
// NAME . NAME ( STRING , ) = ( ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
622-
// NAME ( NAME ( STRING , ) = STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
623-
// NAME . NAME ( STRING , class * STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
624-
// NAME . NAME ( STRING , class - STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
625-
// NAME . NAME ( STRING , class not STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
626-
// NAME . NAME ( STRING , not + STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
627-
// NAME . NAME ( STRING , ) ( STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
628-
// NAME . NAME ( STRING , * + STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
629-
// NAME . NAME ( STRING ( ) = STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
630-
// NAME . NAME ( STRING , ** + STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
631-
// NAME . NAME ( STRING , * - STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
632-
// NAME . NAME ( STRING , ) = STRING ( ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
633-
// NAME . NAME ( STRING , * not STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
634-
// NAME . NAME ( STRING , ( ) = STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
635-
// NAME . NAME ( STRING , + + STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
636-
// NAME . NAME ( STRING , ** - STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
637-
// NAME . NAME ( STRING , None = STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
638-
// NAME . NAME ( STRING , ** not STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
639-
// NAME . NAME ( STRING , - + STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
640-
// NAME . NAME ( STRING , + - STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
641-
// NAME . NAME ( STRING , True = STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
642-
// NAME . NAME ( STRING , not - STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
643-
// NAME . NAME ( STRING , ) = ( STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
644-
// NAME . NAME ( STRING , - - STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
645-
// NAME . NAME ( STRING , [ ] = STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
646-
// NAME . NAME ( STRING , { } = STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
647-
// NAME . NAME ( STRING , not not STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
648-
// NAME . NAME ( ( STRING , ) = STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
649-
// NAME . NAME ( [ STRING , ] = STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
650-
// NAME . NAME ( STRING , lambda : STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
651-
// NAME . NAME ( { STRING , } = STRING ) . NAME ( STRING , NAME = NAME . NAME ( STRING ) ) NEWLINE
630+
}

0 commit comments

Comments
 (0)