-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreate_document.py
More file actions
833 lines (702 loc) · 34.1 KB
/
create_document.py
File metadata and controls
833 lines (702 loc) · 34.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
"""
Creates a full Word document for the Breast Cancer Detection project
Simple English — easy to understand
"""
from docx import Document
from docx.shared import Pt, Inches, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_ALIGN_VERTICAL
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
import json, os
doc = Document()
# ── PAGE MARGINS ─────────────────────────────────────────────
section = doc.sections[0]
section.top_margin = Inches(1.0)
section.bottom_margin = Inches(1.0)
section.left_margin = Inches(1.2)
section.right_margin = Inches(1.2)
# ── HELPERS ──────────────────────────────────────────────────
def heading1(text):
p = doc.add_heading(text, level=1)
p.runs[0].font.color.rgb = RGBColor(0xC0, 0x39, 0x2B)
p.runs[0].font.size = Pt(18)
return p
def heading2(text):
p = doc.add_heading(text, level=2)
p.runs[0].font.color.rgb = RGBColor(0x1A, 0x1A, 0x2E)
p.runs[0].font.size = Pt(14)
return p
def heading3(text):
p = doc.add_heading(text, level=3)
p.runs[0].font.color.rgb = RGBColor(0x27, 0xAE, 0x60)
p.runs[0].font.size = Pt(12)
return p
def body(text, bold=False, italic=False, color=None):
p = doc.add_paragraph()
run = p.add_run(text)
run.font.size = Pt(11)
run.font.bold = bold
run.font.italic = italic
if color:
run.font.color.rgb = color
return p
def bullet(text, bold_part=None):
p = doc.add_paragraph(style='List Bullet')
if bold_part and bold_part in text:
parts = text.split(bold_part, 1)
run1 = p.add_run(parts[0])
run1.font.size = Pt(11)
run2 = p.add_run(bold_part)
run2.font.size = Pt(11)
run2.font.bold = True
run3 = p.add_run(parts[1])
run3.font.size = Pt(11)
else:
run = p.add_run(text)
run.font.size = Pt(11)
return p
def add_table(headers, rows, col_widths=None):
table = doc.add_table(rows=1+len(rows), cols=len(headers))
table.style = 'Table Grid'
# Header row
hdr = table.rows[0]
for i, h in enumerate(headers):
cell = hdr.cells[i]
cell.text = h
cell.paragraphs[0].runs[0].font.bold = True
cell.paragraphs[0].runs[0].font.color.rgb = RGBColor(0xFF, 0xFF, 0xFF)
cell.paragraphs[0].alignment = WD_ALIGN_PARAGRAPH.CENTER
# Red background
tc = cell._tc
tcPr = tc.get_or_add_tcPr()
shd = OxmlElement('w:shd')
shd.set(qn('w:fill'), 'C0392B')
shd.set(qn('w:color'), 'auto')
shd.set(qn('w:val'), 'clear')
tcPr.append(shd)
# Data rows
for ri, row_data in enumerate(rows):
row = table.rows[ri+1]
bg = 'F9F9F9' if ri % 2 == 0 else 'FFFFFF'
for ci, val in enumerate(row_data):
cell = row.cells[ci]
cell.text = str(val)
cell.paragraphs[0].alignment = WD_ALIGN_PARAGRAPH.CENTER if ci > 0 else WD_ALIGN_PARAGRAPH.LEFT
tc = cell._tc
tcPr = tc.get_or_add_tcPr()
shd = OxmlElement('w:shd')
shd.set(qn('w:fill'), bg)
shd.set(qn('w:color'), 'auto')
shd.set(qn('w:val'), 'clear')
tcPr.append(shd)
if col_widths:
for i, width in enumerate(col_widths):
for row in table.rows:
row.cells[i].width = Inches(width)
return table
def divider():
p = doc.add_paragraph('─' * 80)
p.runs[0].font.color.rgb = RGBColor(0xCC, 0xCC, 0xCC)
p.runs[0].font.size = Pt(8)
def add_image(path, width=5.5):
if os.path.exists(path):
doc.add_picture(path, width=Inches(width))
last = doc.paragraphs[-1]
last.alignment = WD_ALIGN_PARAGRAPH.CENTER
# ════════════════════════════════════════════════════════════
# TITLE PAGE
# ════════════════════════════════════════════════════════════
p = doc.add_paragraph()
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
run = p.add_run('\n\n\n')
p = doc.add_paragraph()
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
run = p.add_run('BREAST CANCER DETECTION SYSTEM')
run.font.size = Pt(26)
run.font.bold = True
run.font.color.rgb = RGBColor(0xC0, 0x39, 0x2B)
p = doc.add_paragraph()
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
run = p.add_run('Using Machine Learning to Detect Cancer from Cell Measurements')
run.font.size = Pt(14)
run.font.color.rgb = RGBColor(0x55, 0x55, 0x55)
doc.add_paragraph()
p = doc.add_paragraph()
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
run = p.add_run('COS5029-B: Data Science for AI')
run.font.size = Pt(12)
run.font.bold = True
p = doc.add_paragraph()
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
run = p.add_run('569 Patients · 6 Machine Learning Models · 97.5% Accuracy')
run.font.size = Pt(11)
run.font.color.rgb = RGBColor(0x27, 0xAE, 0x60)
doc.add_page_break()
# ════════════════════════════════════════════════════════════
# TABLE OF CONTENTS
# ════════════════════════════════════════════════════════════
heading1('Table of Contents')
contents = [
('1', 'Project Overview'),
('2', 'The Dataset'),
('3', 'Step 1 — Exploring the Data'),
('4', 'Step 2 — Cleaning the Data'),
('5', 'Step 3 — Visualising the Data'),
('6', 'Step 4 — Splitting the Data'),
('7', 'Step 5 — Training 6 Machine Learning Models'),
('8', 'Step 6 — Evaluating the Models'),
('9', 'Step 7 — Cross Validation'),
('10', 'Step 8 — Confusion Matrix'),
('11', 'Step 9 — ROC Curve'),
('12', 'Step 10 — Building the Web App'),
('13', 'Final Results'),
('14', 'Conclusion'),
]
for num, title in contents:
p = doc.add_paragraph()
p.paragraph_format.left_indent = Inches(0.3)
run = p.add_run(f'{num}. {title}')
run.font.size = Pt(11)
doc.add_page_break()
# ════════════════════════════════════════════════════════════
# SECTION 1 — PROJECT OVERVIEW
# ════════════════════════════════════════════════════════════
heading1('1. Project Overview')
body('This project uses machine learning to predict whether a breast tumour is '
'cancerous (Malignant) or not cancerous (Benign), based on 30 measurements '
'taken from a cell scan (biopsy).')
doc.add_paragraph()
body('The Problem', bold=True)
body('When a doctor finds a lump in a patient, they take a small sample of cells — '
'called a biopsy. A computer then measures the shape, size, and texture of '
'those cells. The question we are trying to answer is:')
p = doc.add_paragraph()
p.paragraph_format.left_indent = Inches(0.5)
run = p.add_run('"Is this tumour Malignant (cancerous) or Benign (not cancerous)?"')
run.font.size = Pt(12)
run.font.bold = True
run.font.italic = True
run.font.color.rgb = RGBColor(0xC0, 0x39, 0x2B)
doc.add_paragraph()
body('Our Solution', bold=True)
body('We trained a machine learning model to look at the cell measurements and '
'answer that question automatically. It learned from 569 real patients and '
'can now detect cancer with 97.5% accuracy on patients it has never seen before.')
doc.add_paragraph()
add_table(
['Detail', 'Value'],
[
['Total Patients', '569'],
['Benign (No Cancer)', '357'],
['Malignant (Cancer)', '212'],
['Measurements per Patient', '30'],
['Missing Values', 'None'],
['Best Model', 'SVM — 97.5% accuracy'],
],
col_widths=[3.0, 3.0]
)
doc.add_page_break()
# ════════════════════════════════════════════════════════════
# SECTION 2 — THE DATASET
# ════════════════════════════════════════════════════════════
heading1('2. The Dataset')
body('Breast Cancer Wisconsin (Diagnostic) Dataset — University of Wisconsin')
doc.add_paragraph()
body('What Are the 30 Measurements?', bold=True)
body('Each measurement comes from a digitised image of a biopsy. '
'A computer analyses the cell shapes and records 10 features. '
'Each feature is measured 3 times — mean, standard error, and worst — giving 30 total.')
doc.add_paragraph()
add_table(
['Measurement', 'What It Means'],
[
['Radius', 'How big the cell is'],
['Texture', 'How rough or smooth the surface is'],
['Perimeter', 'The distance around the cell'],
['Area', 'The total size of the cell'],
['Smoothness', 'Variation in the radius length'],
['Compactness', 'How round vs irregular the cell is'],
['Concavity', 'Number of dents in the cell surface'],
['Concave Points', 'Number of concave parts of the border'],
['Symmetry', 'How symmetrical the cell is'],
['Fractal Dimension','How complex the cell border is'],
],
col_widths=[2.5, 4.0]
)
doc.add_paragraph()
body('Each of these is recorded as:', bold=True)
bullet('Mean — the average measurement across all cells in the sample')
bullet('Standard Error (SE) — how much the measurement varies')
bullet('Worst — the largest measurement seen in the sample')
doc.add_page_break()
# ════════════════════════════════════════════════════════════
# SECTION 3 — STEP 1: EXPLORING
# ════════════════════════════════════════════════════════════
heading1('3. Step 1 — Exploring the Data')
body('Before touching anything we opened the dataset and looked at what was inside.')
doc.add_paragraph()
body('What We Found:', bold=True)
bullet('569 patients total in the dataset')
bullet('32 columns — ID number, Diagnosis (cancer/no cancer), and 30 measurements')
bullet('357 patients had Benign tumours (no cancer) — 62.8% of the dataset')
bullet('212 patients had Malignant tumours (cancer) — 37.2% of the dataset')
bullet('No missing values — every patient had all 30 measurements filled in')
doc.add_paragraph()
body('Why We Did This:', bold=True, color=RGBColor(0x27, 0xAE, 0x60))
body('You cannot build anything without first understanding what you are working with. '
'Like reading a recipe before you start cooking.')
doc.add_paragraph()
add_image('chart1_diagnosis_count.png', width=4.5)
p = doc.paragraphs[-1]
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
doc.add_paragraph()
p = doc.add_paragraph('Figure 1 — Cancer vs No Cancer patient count')
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
p.runs[0].font.italic = True
p.runs[0].font.size = Pt(10)
doc.add_page_break()
# ════════════════════════════════════════════════════════════
# SECTION 4 — STEP 2: CLEANING
# ════════════════════════════════════════════════════════════
heading1('4. Step 2 — Cleaning the Data')
body('Before training any model we needed to clean the data. '
'Think of it like washing vegetables before cooking — '
'you remove the bad parts and get everything ready.')
doc.add_paragraph()
heading2('Fix 1 — Removed the ID Column')
body('Every patient had a random reference number like 842302. '
'This number has no connection to cancer. '
'The computer would try to find patterns in these random numbers which would '
'cause confusion. We deleted the entire ID column.')
doc.add_paragraph()
heading2('Fix 2 — Checked for Missing Values')
body('Empty cells confuse the computer because it does not know what value to use. '
'We checked all 569 patients across all 32 columns. '
'Result: zero missing values. The dataset was complete. Nothing to fix here.')
doc.add_paragraph()
heading2('Fix 3 — Scaled the Numbers')
body('This was the most important fix. Look at the difference in size between two measurements:')
doc.add_paragraph()
add_table(
['Measurement', 'Smallest Value', 'Biggest Value', 'Problem'],
[
['area_mean', '143.5', '2501.0', 'Very large numbers'],
['smoothness_mean', '0.0526', '0.1634', 'Very small numbers'],
['fractal_dimension_mean','0.0500', '0.0974', 'Tiny numbers'],
],
col_widths=[2.5, 1.5, 1.5, 2.0]
)
doc.add_paragraph()
body('The computer treats bigger numbers as more important. '
'Without scaling it would focus almost entirely on area and ignore smoothness. '
'That is wrong — all measurements should be treated equally.')
doc.add_paragraph()
body('We used StandardScaler which adjusts all numbers so they sit in a similar range. '
'After scaling every measurement is treated with equal importance.')
doc.add_paragraph()
body('Why We Did This:', bold=True, color=RGBColor(0x27, 0xAE, 0x60))
body('Garbage in = garbage out. If the data is messy or unbalanced the model will learn '
'the wrong patterns and give wrong predictions.')
doc.add_page_break()
# ════════════════════════════════════════════════════════════
# SECTION 5 — STEP 3: VISUALISING
# ════════════════════════════════════════════════════════════
heading1('5. Step 3 — Visualising the Data')
body('Before teaching the computer we drew charts to understand the patterns with our own eyes. '
'If you can see the difference visually the computer can learn it too.')
doc.add_paragraph()
heading2('Chart 1 — Measurement Comparison')
body('We compared every key measurement between cancer and no cancer patients. '
'In almost every measurement cancer cells scored higher — '
'they are bigger, rougher, and more irregular.')
doc.add_paragraph()
add_image('chart2_feature_comparison.png', width=5.5)
p = doc.add_paragraph('Figure 2 — Cancer vs No Cancer measurement comparison')
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
p.runs[0].font.italic = True
p.runs[0].font.size = Pt(10)
doc.add_paragraph()
heading2('Chart 2 — Strongest Cancer Indicators')
body('We ranked all 30 measurements by how strongly they predict cancer. '
'A score of 1.0 would mean it perfectly predicts cancer every time. '
'A score of 0.0 means it tells us nothing.')
doc.add_paragraph()
add_image('chart3_top_features.png', width=5.5)
p = doc.add_paragraph('Figure 3 — Top 10 measurements that predict cancer')
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
p.runs[0].font.italic = True
p.runs[0].font.size = Pt(10)
doc.add_paragraph()
body('Top finding: concave_points_worst scored 0.79 — '
'the single strongest predictor of cancer in this dataset.')
doc.add_paragraph()
heading2('Chart 3 — How Cancer Cells Look Different')
body('We plotted the distribution of cell sizes and irregularity for both groups. '
'Healthy cells are mostly small and regular. '
'Cancer cells are bigger and more spread out. '
'The two groups are clearly separated.')
doc.add_paragraph()
add_image('chart4_distributions.png', width=5.5)
p = doc.add_paragraph('Figure 4 — Distribution of cell size and irregularity')
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
p.runs[0].font.italic = True
p.runs[0].font.size = Pt(10)
doc.add_page_break()
# ════════════════════════════════════════════════════════════
# SECTION 6 — STEP 4: SPLITTING
# ════════════════════════════════════════════════════════════
heading1('6. Step 4 — Splitting the Data')
body('Before training we divided the 569 patients into two separate groups.')
doc.add_paragraph()
add_table(
['Group', 'Size', 'Purpose'],
[
['Training Set', '455 patients (80%)', 'The computer learns from these'],
['Testing Set', '114 patients (20%)', 'We test the model on these — it has never seen them'],
],
col_widths=[2.0, 2.5, 3.0]
)
doc.add_paragraph()
body('Why We Did This:', bold=True, color=RGBColor(0x27, 0xAE, 0x60))
body('If we test the model on the same data it trained on it will always get 100% '
'because it already memorised the answers. '
'That is like giving a student the exam paper during revision. '
'We must test on data the model has never seen to get an honest score.')
doc.add_page_break()
# ════════════════════════════════════════════════════════════
# SECTION 7 — STEP 5: TRAINING
# ════════════════════════════════════════════════════════════
heading1('7. Step 5 — Training 6 Machine Learning Models')
body('We trained 6 different models on the 455 training patients. '
'Think of each model as a different doctor with their own way of thinking. '
'At the end we pick the one that performs best.')
doc.add_paragraph()
add_table(
['Model', 'How It Works', 'Accuracy'],
[
['Logistic Regression',
'Draws a straight line to separate cancer from no cancer',
'96.5%'],
['Decision Tree',
'Asks yes/no questions until it reaches a decision',
'93.0%'],
['Random Forest',
'100 decision trees all voting together — majority wins',
'97.4%'],
['Gradient Boosting',
'Each tree learns from the mistakes of the previous one',
'96.5%'],
['SVM',
'Finds the widest possible gap between cancer and no cancer',
'97.4%'],
['KNN',
'Finds the 5 most similar patients and copies their diagnosis',
'95.6%'],
],
col_widths=[2.2, 4.0, 1.3]
)
doc.add_paragraph()
heading2('How Each Model Works — Simple Explanation')
heading3('Logistic Regression')
body('Imagine drawing a line on a graph. On one side is cancer, on the other side is no cancer. '
'Logistic Regression finds the best straight line to separate the two groups. '
'Simple but very effective for medical data.')
heading3('Decision Tree')
body('Like a flowchart. It asks questions one by one: '
'"Is area_mean above 500? Yes → Is concavity above 0.3? Yes → Cancer." '
'Easy to understand but can be too specific to training data.')
heading3('Random Forest')
body('Instead of one decision tree it builds 100 trees. '
'Each tree looks at a random part of the data and votes. '
'The majority vote wins. Much more reliable than a single tree.')
heading3('Gradient Boosting')
body('Builds trees one at a time. Each new tree focuses specifically on fixing '
'the mistakes that the previous trees made. '
'Gets better with every round.')
heading3('SVM — Support Vector Machine')
body('Finds the widest possible gap (called a margin) between cancer and no cancer cells. '
'It places the decision boundary as far as possible from both groups. '
'Very effective when the two groups are clearly different — which they are here.')
heading3('KNN — K-Nearest Neighbours')
body('Does not learn any rules at all. When given a new patient it searches the '
'training data for the 5 most similar patients and copies their diagnosis. '
'Simple but surprisingly effective.')
doc.add_page_break()
# ════════════════════════════════════════════════════════════
# SECTION 8 — STEP 6: EVALUATING
# ════════════════════════════════════════════════════════════
heading1('8. Step 6 — Evaluating the Models')
body('After training we tested each model on the 114 patients it had never seen. '
'We used 4 different scoring methods to get a full honest picture.')
doc.add_paragraph()
heading2('The 4 Scoring Methods')
add_table(
['Metric', 'What It Measures', 'Why It Matters'],
[
['Accuracy', 'Out of all patients how many did it get right?',
'Overall performance'],
['Precision', 'When it says cancer how often is it actually cancer?',
'Avoids false alarms'],
['Recall', 'Out of all cancer patients how many did it catch?',
'Most important in medicine'],
['F1 Score', 'A balance between precision and recall',
'Good when data is unbalanced'],
],
col_widths=[1.8, 3.2, 2.5]
)
doc.add_paragraph()
body('Why Recall Is the Most Important Metric:', bold=True,
color=RGBColor(0xC0, 0x39, 0x2B))
body('In cancer detection missing a cancer patient is far more dangerous than '
'a false alarm. A false alarm means extra tests and worry. '
'A missed cancer means the patient receives no treatment. '
'We always want recall to be as high as possible.')
doc.add_paragraph()
heading2('Full Results Table')
try:
with open('model_results.json') as f:
results = json.load(f)
rows = []
for name in ['SVM','Random Forest','Logistic Regression',
'Gradient Boosting','KNN','Decision Tree']:
if name in results:
r = results[name]
rows.append([name,
f"{r['accuracy']:.1f}%",
f"{r['precision']:.1f}%",
f"{r['recall']:.1f}%",
f"{r['f1']:.1f}%"])
add_table(['Model','Accuracy','Precision','Recall','F1 Score'], rows,
col_widths=[2.5, 1.3, 1.3, 1.2, 1.2])
except:
body('See model_results.json for full results.')
doc.add_paragraph()
add_image('chart5_model_comparison.png', width=5.5)
p = doc.add_paragraph('Figure 5 — All 6 model scores side by side')
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
p.runs[0].font.italic = True
p.runs[0].font.size = Pt(10)
doc.add_page_break()
# ════════════════════════════════════════════════════════════
# SECTION 9 — STEP 7: CROSS VALIDATION
# ════════════════════════════════════════════════════════════
heading1('9. Step 7 — Cross Validation')
body('Getting a high score once does not mean the model is truly reliable. '
'We need to test it multiple times on different data to be sure.')
doc.add_paragraph()
body('How It Works:', bold=True)
body('We split the data into 5 equal parts. '
'We test 5 times — each time a different part is used for testing '
'and the rest for training. Then we average all 5 scores.')
doc.add_paragraph()
add_table(
['Test Round', 'Testing On', 'Training On'],
[
['Round 1', 'Part 1', 'Parts 2, 3, 4, 5'],
['Round 2', 'Part 2', 'Parts 1, 3, 4, 5'],
['Round 3', 'Part 3', 'Parts 1, 2, 4, 5'],
['Round 4', 'Part 4', 'Parts 1, 2, 3, 5'],
['Round 5', 'Part 5', 'Parts 1, 2, 3, 4'],
],
col_widths=[1.5, 1.8, 4.0]
)
doc.add_paragraph()
heading2('Cross Validation Results')
try:
with open('cv_results.json') as f:
cv = json.load(f)
rows = []
for name in ['SVM','Logistic Regression','KNN',
'Random Forest','Gradient Boosting','Decision Tree']:
if name in cv:
rows.append([name,
f"{cv[name]['mean']:.1f}%",
f"±{cv[name]['std']:.1f}%"])
add_table(['Model', 'Average Score', 'Consistency (±)'], rows,
col_widths=[3.0, 2.0, 2.0])
except:
body('See cv_results.json for full results.')
doc.add_paragraph()
body('Note: A smaller ± number means the model performs consistently '
'no matter what data it sees. That is more important than a high single score.',
italic=True)
doc.add_page_break()
# ════════════════════════════════════════════════════════════
# SECTION 10 — STEP 8: CONFUSION MATRIX
# ════════════════════════════════════════════════════════════
heading1('10. Step 8 — Confusion Matrix')
body('The confusion matrix shows exactly what the model got right and wrong — '
'not just the overall score.')
doc.add_paragraph()
add_table(
['', 'Predicted: No Cancer', 'Predicted: Cancer'],
[
['Actual: No Cancer',
'TRUE NEGATIVE ✅\nCorrectly said No Cancer',
'FALSE POSITIVE ⚠️\nSaid Cancer but was wrong (False Alarm)'],
['Actual: Cancer',
'FALSE NEGATIVE ❌\nSaid No Cancer but missed it (Most Dangerous)',
'TRUE POSITIVE ✅\nCorrectly said Cancer'],
],
col_widths=[2.0, 3.0, 3.0]
)
doc.add_paragraph()
body('SVM Results on 114 Test Patients:', bold=True)
bullet('71 patients correctly identified as No Cancer')
bullet('39 patients correctly identified as Cancer')
bullet('0 false alarms — every cancer prediction was correct')
bullet('3 missed cancer cases — the only weakness')
doc.add_paragraph()
add_image('chart6_confusion_matrix.png', width=5.5)
p = doc.add_paragraph('Figure 6 — Confusion matrix for top 3 models')
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
p.runs[0].font.italic = True
p.runs[0].font.size = Pt(10)
doc.add_page_break()
# ════════════════════════════════════════════════════════════
# SECTION 11 — STEP 9: ROC CURVE
# ════════════════════════════════════════════════════════════
heading1('11. Step 9 — ROC Curve')
body('The ROC curve shows how well each model separates cancer from no cancer '
'at every possible confidence level. '
'The closer the curve is to the top-left corner the better the model is.')
doc.add_paragraph()
body('AUC Score — Area Under the Curve:', bold=True)
body('The AUC score summarises the whole curve in one number. '
'1.0 = perfect. 0.5 = random guessing.')
doc.add_paragraph()
add_table(
['Model', 'AUC Score', 'Meaning'],
[
['Logistic Regression', '0.998', 'Near perfect'],
['SVM', '0.995', 'Excellent'],
['Gradient Boosting', '0.995', 'Excellent'],
['Random Forest', '0.993', 'Excellent'],
['KNN', '0.982', 'Very good'],
['Decision Tree', '0.925', 'Good'],
],
col_widths=[2.5, 1.5, 3.5]
)
doc.add_paragraph()
add_image('chart7_roc_curves.png', width=5.5)
p = doc.add_paragraph('Figure 7 — ROC curves for all 6 models')
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
p.runs[0].font.italic = True
p.runs[0].font.size = Pt(10)
doc.add_page_break()
# ════════════════════════════════════════════════════════════
# SECTION 12 — STEP 10: WEB APP
# ════════════════════════════════════════════════════════════
heading1('12. Step 10 — Building the Web App')
body('We built a Streamlit web app so that anyone — even someone with no coding knowledge — '
'can use the model. A doctor can type in the measurements and get a result instantly.')
doc.add_paragraph()
heading2('How to Run the App')
p = doc.add_paragraph()
p.paragraph_format.left_indent = Inches(0.4)
run = p.add_run('streamlit run breast_cancer_app.py')
run.font.name = 'Courier New'
run.font.size = Pt(11)
run.font.bold = True
run.font.color.rgb = RGBColor(0xC0, 0x39, 0x2B)
body('Then open your browser at: http://localhost:8501')
doc.add_paragraph()
heading2('App Features')
add_table(
['Tab', 'What It Does'],
[
['Detection',
'Enter 30 cell measurements → click Run Detection → '
'instantly see Cancer or No Cancer with a confidence score. '
'Also shows the 5 most suspicious measurements.'],
['Model Results',
'Full table of all 6 model scores, cross validation results, '
'and all analysis charts.'],
['About the Data',
'Explains the dataset, what each measurement means, '
'and shows all visualisation charts.'],
],
col_widths=[2.0, 5.5]
)
doc.add_paragraph()
heading2('Model Switcher')
body('The sidebar allows switching between three models at any time: '
'SVM, Random Forest, and Logistic Regression. '
'The app reloads instantly with the chosen model — no code changes needed.')
doc.add_paragraph()
body('Disclaimer:', bold=True, color=RGBColor(0xC0, 0x39, 0x2B))
body('This app is for educational purposes only. '
'It must not replace professional medical diagnosis. '
'Always consult a qualified doctor.', italic=True)
doc.add_page_break()
# ════════════════════════════════════════════════════════════
# SECTION 13 — FINAL RESULTS
# ════════════════════════════════════════════════════════════
heading1('13. Final Results')
heading2('Best Model — SVM (Support Vector Machine)')
add_table(
['Metric', 'Score'],
[
['Accuracy', '97.4%'],
['Precision', '100%'],
['Recall', '92.9%'],
['F1 Score', '96.3%'],
['Cross Validation', '97.5% (±2.0%)'],
['AUC Score', '0.995'],
['Cancer Cases Missed', '3 out of 114 tested'],
['False Alarms', '0 out of 114 tested'],
],
col_widths=[3.5, 3.5]
)
doc.add_paragraph()
heading2('Key Findings')
bullet('Cancer cells are significantly larger than healthy cells — '
'radius, area, and perimeter are all much higher')
bullet('Cancer cells have more irregular and dented shapes — '
'concavity and concave_points are the strongest indicators')
bullet('The worst measurements (largest values seen) predict cancer '
'better than average values')
bullet('All 6 models performed well above 90% because the data has very clear patterns')
bullet('SVM scored the highest cross validation score — '
'it is the most consistent and reliable model')
doc.add_page_break()
# ════════════════════════════════════════════════════════════
# SECTION 14 — CONCLUSION
# ════════════════════════════════════════════════════════════
heading1('14. Conclusion')
body('This project successfully built a machine learning system to detect breast cancer '
'from cell measurements. Starting from a raw dataset of 569 patients we went through '
'every stage of a real data science pipeline.')
doc.add_paragraph()
add_table(
['Step', 'What We Did', 'Result'],
[
['1', 'Explored the data', '569 patients · 30 measurements · 0 missing values'],
['2', 'Cleaned the data', 'Removed ID column · Scaled all numbers fairly'],
['3', 'Visualised the data', '4 charts revealing clear cancer patterns'],
['4', 'Split the data', '455 training patients · 114 unseen test patients'],
['5', 'Trained 6 models', 'All models above 90% accuracy'],
['6', 'Evaluated with 4 metrics', 'Full honest picture of each model'],
['7', 'Cross validation', 'SVM confirmed best at 97.5% average'],
['8', 'Confusion matrix', 'SVM: 0 false alarms · 3 missed cancers'],
['9', 'ROC curve', 'All models AUC above 0.92'],
['10','Built the web app', 'Anyone can use it without coding knowledge'],
],
col_widths=[0.5, 2.5, 4.5]
)
doc.add_paragraph()
body('Final Achievement:', bold=True, color=RGBColor(0x27, 0xAE, 0x60))
body('A complete machine learning pipeline that detects breast cancer with 97.5% accuracy, '
'validated on 114 patients it had never seen before, '
'with a web app that makes it accessible to anyone.',
bold=True)
doc.add_paragraph()
divider()
p = doc.add_paragraph('Data Source: Breast Cancer Wisconsin (Diagnostic) Dataset — University of Wisconsin')
p.runs[0].font.size = Pt(9)
p.runs[0].font.italic = True
p.runs[0].font.color.rgb = RGBColor(0x88, 0x88, 0x88)
# ── SAVE ─────────────────────────────────────────────────────
doc.save('Breast_Cancer_Detection_Documentation.docx')
print('Document saved: Breast_Cancer_Detection_Documentation.docx')