Skip to content

Commit 2c88634

Browse files
committed
Updated output_path, added an existence check, added prompt.looks_like_vid_type() filtering, and added emoji stripping from keys
1 parent f715cd9 commit 2c88634

1 file changed

Lines changed: 15 additions & 3 deletions

File tree

utils/update-personas-from-prompts-csv.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from python.utils.lib import data, log
88

99
prompts_csv_url = 'https://huggingface.co/datasets/fka/prompts.chat/raw/main/prompts.csv'
10-
output_path = Path(__file__).parent.parent.parent / 'data/ai-personas.json'
10+
output_path = Path(__file__).parent.parent / 'data/ai-personas.json'
1111

1212
log.info(f'Downloading {prompts_csv_url}...')
1313
csv.field_size_limit(10**9) # to accommodate longass prompts
@@ -22,20 +22,32 @@
2222
row for row in prompt_rows
2323
if row.get('type') == 'TEXT'
2424
and not prompt.looks_like_img_type(row.get('prompt', ''))
25+
and not prompt.looks_like_vid_type(row.get('prompt', ''))
2526
and row.get('act', '').strip().lower() != 'test'
2627
and (row_lower := row['act'].strip().lower()) not in seen_personas
2728
and not seen_personas.add(row_lower)
2829
]
2930
log.success(f'{len(text_prompt_rows):,} text prompts found!')
3031

3132
log.info(f'Reading {output_path}...')
32-
personas = data.json.read(output_path) if output_path.exists() else {}
33+
if not output_path.exists():
34+
log.error(f'Output path does not exist: {output_path}')
35+
raise SystemExit(1)
36+
personas = data.json.read(output_path)
3337
log.success(f'{len(personas):,} previous personas loaded!')
3438

3539
log.info('Adding new personas...')
3640
added_cnt = 0
41+
emoji_re = re.compile(
42+
'['
43+
'\U0001F300-\U0001FAFF' # symbols/emoji
44+
'\U00002700-\U000027BF' # dingbats
45+
'\U0001F1E0-\U0001F1FF' # flags
46+
']+',
47+
flags=re.UNICODE
48+
)
3749
for row in text_prompt_rows:
38-
role = re.sub(r'^# |["“”‘’]', '', row['act']).strip()
50+
role = re.sub(r'^# |["“”‘’]', '', emoji_re.sub('', row['act'])).strip()
3951
persona = {'prompt': row['prompt'].strip()}
4052
if row.get('for_devs', '').strip().upper() == 'TRUE':
4153
persona['targetAudience'] = ['devs']

0 commit comments

Comments
 (0)