geraldohomero
diff --git a/‎data/transcriptions/transcriptTime.py‎
Lines changed: 403 additions & 0 deletions b/‎data/transcriptions/transcriptTime.py‎
Lines changed: 403 additions & 0 deletions
diff --git a/‎pipeline/canalEspecifico.py‎
Lines changed: 514 additions & 0 deletions b/‎pipeline/canalEspecifico.py‎
Lines changed: 514 additions & 0 deletions
diff --git a/‎pipeline/clean_transcripts.py‎
Lines changed: 34 additions & 25 deletions b/‎pipeline/clean_transcripts.py‎
Lines changed: 34 additions & 25 deletions
diff --git a/‎pipeline/getDataFromSQLite.py‎
Lines changed: 70 additions & 0 deletions b/‎pipeline/getDataFromSQLite.py‎
Lines changed: 70 additions & 0 deletions
diff --git a/‎pipeline/output_visDidi/bigramas_Didi_Red_Pill_alto_Parte1_Nov-Dez_2022.txt‎
Lines changed: 16 additions & 0 deletions b/‎pipeline/output_visDidi/bigramas_Didi_Red_Pill_alto_Parte1_Nov-Dez_2022.txt‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎pipeline/output_visDidi/bigramas_Didi_Red_Pill_alto_Parte2_7-18_Jan_2023.txt‎
Lines changed: 16 additions & 0 deletions b/‎pipeline/output_visDidi/bigramas_Didi_Red_Pill_alto_Parte2_7-18_Jan_2023.txt‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎pipeline/output_visDidi/bigramas_Didi_Red_Pill_baixo_Parte1_Nov-Dez_2022.txt‎
Lines changed: 16 additions & 0 deletions b/‎pipeline/output_visDidi/bigramas_Didi_Red_Pill_baixo_Parte1_Nov-Dez_2022.txt‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎pipeline/output_visDidi/bigramas_Didi_Red_Pill_baixo_Parte2_7-18_Jan_2023.txt‎
Lines changed: 16 additions & 0 deletions b/‎pipeline/output_visDidi/bigramas_Didi_Red_Pill_baixo_Parte2_7-18_Jan_2023.txt‎
Lines changed: 16 additions & 0 deletions
@@ -16,8 +16,8 @@
 # Configuração de diretórios
 # ------------------------------------------------------------
 BASE_DIR = Path(__file__).resolve().parent.parent
-RAW_DB_PATH = BASE_DIR / "db" / "YouTubeStatsPipeline.sqlite3"
-OUTPUT_PATH = BASE_DIR / "data" / "processed" / "transcripts_limpos2Metric.csv"
+RAW_DB_PATH = BASE_DIR / "db" / "YouTubeStatsPipe2.sqlite3"
+OUTPUT_PATH = BASE_DIR / "data" / "processed" / "transcripts_limpos4ComMetric.csv"
 
 # Configuração de memória - otimizada para 32GB RAM
 INITIAL_CHUNK_SIZE = 2000  # Aumentado de 500 para 2000
@@ -238,7 +238,7 @@ def preprocess_text_batch(texts, batch_size: int = 64, n_process: int = N_PROCES
 # ------------------------------------------------------------
 # Pipeline
 # ------------------------------------------------------------
-def process_chunk(chunk, start_date, end_date):
+def process_chunk(chunk):
     """Processa um chunk de dados."""
     # Converte 'publishedAt' para datetime, tratando erros
     chunk['publishedAt'] = pd.to_datetime(chunk['publishedAt'], errors='coerce')
@@ -249,41 +249,37 @@ def process_chunk(chunk, start_date, end_date):
         if metric in chunk.columns:
             chunk[metric] = pd.to_numeric(chunk[metric], errors='coerce')
 
-    # Filtra o DataFrame para o período desejado
-    mask = (chunk['publishedAt'] >= start_date) & (chunk['publishedAt'] < end_date)
-    df_filtrado = chunk[mask].copy()
-    
-    if len(df_filtrado) == 0:
+    if len(chunk) == 0:
         return None
 
-    print(f"Processando chunk com {len(df_filtrado)} registros...")
+    print(f"Processando chunk com {len(chunk)} registros...")
 
     # Lista as colunas presentes para verificação
-    print(f"Colunas disponíveis: {df_filtrado.columns.tolist()}")
+    print(f"Colunas disponíveis: {chunk.columns.tolist()}")
 
     # Verifica se as métricas de engajamento estão presentes
-    metrics_present = [metric for metric in engagement_metrics if metric in df_filtrado.columns]
+    metrics_present = [metric for metric in engagement_metrics if metric in chunk.columns]
     if metrics_present:
         print(f"Métricas de engajamento incluídas: {metrics_present}")
     else:
         print("AVISO: Nenhuma métrica de engajamento encontrada nos dados.")
 
     # Mais otimizações para usar mais memória disponível
-    df_filtrado["cleanTranscript"] = preprocess_text_batch(
-        df_filtrado["videoTranscript"].tolist(),
+    chunk["cleanTranscript"] = preprocess_text_batch(
+        chunk["videoTranscript"].tolist(),
         batch_size=64,
         n_process=N_PROCESS, 
         show_progress=True
     )
 
     # Remove a coluna videoTranscript para economizar espaço
-    df_filtrado = df_filtrado.drop(columns=['videoTranscript'])
+    chunk = chunk.drop(columns=['videoTranscript'])
 
     # Garantir que valores nulos nas métricas sejam substituídos por zeros
     for metric in metrics_present:
-        df_filtrado[metric] = df_filtrado[metric].fillna(0).astype(int)
+        chunk[metric] = chunk[metric].fillna(0).astype(int)
 
-    return df_filtrado
+    return chunk
 
 def main():
     # Add global declaration for CHUNK_SIZE
@@ -353,22 +349,33 @@ def main():
             print(f"Reduzindo para {CHUNK_SIZE} registros por chunk.")
             continue  # Tente novamente com chunk menor
 
-        processed_chunk = process_chunk(chunk, start_date, end_date)
+        processed_chunk = process_chunk(chunk)
 
         # Liberar memória do chunk original imediatamente
         del chunk
         gc.collect()
 
         if processed_chunk is not None and len(processed_chunk) > 0:
-            # Verificar e informar métricas disponíveis
+            # Filtra o DataFrame para o período desejado APÓS o processamento
+            mask = (processed_chunk['publishedAt'] >= start_date) & (processed_chunk['publishedAt'] < end_date)
+            df_filtrado = processed_chunk[mask].copy()
+            
+            if len(df_filtrado) == 0:
+                print("Nenhum registro no período para este chunk.")
+                del processed_chunk
+                del df_filtrado
+                gc.collect()
+                offset += CHUNK_SIZE
+                continue
+            
             engagement_metrics = ['viewCount', 'likeCount', 'commentCount']
-            metrics_present = [metric for metric in engagement_metrics if metric in processed_chunk.columns]
+            metrics_present = [metric for metric in engagement_metrics if metric in df_filtrado.columns]
 
             if metrics_present:
                 print(f"Exportando com métricas de engajamento: {metrics_present}")
                 # Mostrar estatísticas básicas
                 for metric in metrics_present:
-                    print(f"  - {metric}: média = {processed_chunk[metric].mean():.1f}, máx = {processed_chunk[metric].max()}")
+                    print(f"  - {metric}: média = {df_filtrado[metric].mean():.1f}, máx = {df_filtrado[metric].max()}")
             else:
                 print("AVISO: Nenhuma métrica de engajamento será exportada.")
 
@@ -377,9 +384,9 @@ def main():
             header = first_chunk
 
             # Escrever em pedaços maiores para economizar operações de I/O
-            write_chunk_size = min(500, len(processed_chunk))  # Aumentado de 100 para 500
-            for i in range(0, len(processed_chunk), write_chunk_size):
-                sub_df = processed_chunk.iloc[i:i+write_chunk_size]
+            write_chunk_size = min(500, len(df_filtrado))  # Aumentado de 100 para 500
+            for i in range(0, len(df_filtrado), write_chunk_size):
+                sub_df = df_filtrado.iloc[i:i+write_chunk_size]
                 sub_df.to_csv(
                     OUTPUT_PATH, 
                     index=False, 
@@ -395,14 +402,16 @@ def main():
             if first_chunk:
                 first_chunk = False
 
-            total_processed += len(processed_chunk)
-            print(f"Salvos {len(processed_chunk)} registros no arquivo. Total: {total_processed}")
+            total_processed += len(df_filtrado)
+            print(f"Salvos {len(df_filtrado)} registros no arquivo. Total: {total_processed}")
 
         chunks_processed += 1
         offset += CHUNK_SIZE
 
         # Liberar memória do chunk processado explicitamente
         del processed_chunk
+        if 'df_filtrado' in locals():
+            del df_filtrado
         gc.collect()
 
         # Verificar uso de memória e ajustar o tamanho do chunk se necessário
 
@@ -0,0 +1,70 @@
+import sqlite3
+import csv
+import re
+from pathlib import Path
+import spacy
+
+# Directory two levels above this file (the parent of the script's own folder).
+BASE_DIR = Path(__file__).resolve().parent.parent
+# SQLite database opened by export_videos_to_csv.
+RAW_DB_PATH = BASE_DIR / "db" / "YouTubeStatsPipe2.sqlite3"
+# CSV file written by export_videos_to_csv.
+OUTPUT_PATH = BASE_DIR / "data" / "processed" / "transcripts_limpos5ComMetric.csv"
+
+# Load the spaCy model for Portuguese (done once, at import time).
+nlp = spacy.load("pt_core_news_sm")
+
+def clean_transcript(text):
+    """Reduce a raw transcript to a space-separated string of lemmas.
+
+    Timestamps and line breaks are removed, whitespace is collapsed, and the
+    result is run through the module-level spaCy pipeline ``nlp``. Only
+    alphabetic, non-stopword tokens are kept, each replaced by its lemma.
+
+    Args:
+        text: Raw transcript text; any falsy value (``None``, ``""``) is
+            accepted.
+
+    Returns:
+        The cleaned text, or ``""`` when ``text`` is falsy.
+    """
+    if not text:
+        return ""
+    # Strip bracketed timestamps such as [00:00], [12:34] or [1:23:45].
+    no_bracketed = re.sub(r'\[\d{1,2}:\d{2}(?::\d{2})?\]', '', text)
+    # Strip bare timestamps such as 00:00 or 0:00:00.
+    no_stamps = re.sub(r'\b\d{1,2}:\d{2}(?::\d{2})?\b', '', no_bracketed)
+    # Turn line breaks into spaces, then collapse whitespace runs.
+    single_line = no_stamps.replace('\n', ' ').replace('\r', ' ')
+    normalized = re.sub(r'\s+', ' ', single_line).strip()
+    # Keep the lemma of every alphabetic token that is not a stopword.
+    lemmas = []
+    for token in nlp(normalized):
+        if not token.is_alpha or token.is_stop:
+            continue
+        lemmas.append(token.lemma_)
+    return " ".join(lemmas)
+
+def export_videos_to_csv():
+    """Export videos from a fixed publication window to a cleaned CSV file.
+
+    Selects the listed columns from the ``Videos`` table of the SQLite
+    database at ``RAW_DB_PATH`` for rows with ``publishedAt`` from
+    2022-10-31 (inclusive) to 2023-04-01 (exclusive). Each ``videoTranscript``
+    value is replaced by the output of ``clean_transcript``, and a header row
+    plus all rows are written to ``OUTPUT_PATH`` as UTF-8, creating the
+    parent directory if needed.
+
+    The database connection is closed even when the query fails, so an error
+    does not leave the handle open.
+    """
+    query = """
+        SELECT
+            videoId,
+            channelId,
+            videoTitle,
+            videoTranscript,
+            publishedAt,
+            transcriptLanguage,
+            viewCount,
+            likeCount,
+            commentCount
+        FROM Videos
+        WHERE publishedAt >= '2022-10-31 00:00:00'
+          AND publishedAt < '2023-04-01 00:00:00'
+    """
+    conn = sqlite3.connect(RAW_DB_PATH)
+    try:
+        cursor = conn.cursor()
+        cursor.execute(query)
+        rows = cursor.fetchall()
+        headers = [desc[0] for desc in cursor.description]
+    finally:
+        # All rows are already in memory, so the handle can be released here
+        # regardless of whether the query succeeded.
+        conn.close()
+
+    # Locate the transcript column by name so the cleaning step stays correct
+    # if the SELECT column order ever changes.
+    transcript_idx = headers.index("videoTranscript")
+
+    # Clean transcripts
+    cleaned_rows = []
+    for row in rows:
+        row = list(row)
+        row[transcript_idx] = clean_transcript(row[transcript_idx])
+        cleaned_rows.append(row)
+
+    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
+    with open(OUTPUT_PATH, "w", newline='', encoding='utf-8') as f:
+        writer = csv.writer(f)
+        writer.writerow(headers)
+        writer.writerows(cleaned_rows)
+
+# Run the export only when this file is executed directly as a script.
+if __name__ == "__main__":
+    export_videos_to_csv()
@@ -0,0 +1,16 @@
+Bigrama	Frequência
+forças armadas	951
+todo mundo	869
+presidente bolsonaro	864
+alexandre moraes	582
+certo então	575
+artigo 142	489
+vou mostrar	464
+daqui pouco	445
+mil pessoas	417
+alguma coisa	365
+vamos vamos	360
+porta quartel	347
+boa noite	332
+esqueça dar	326
+tamo junto	323
@@ -0,0 +1,16 @@
+Bigrama	Frequência
+quebra quebra	7
+trabalho jornalístico	5
+todo mundo	5
+qualquer tipo	4
+pingos is	3
+nada disso	3
+certo então	3
+tava acontecendo	3
+velha mídia	3
+volto repetir	3
+hora dessas	3
+então vamos	2
+então tô	2
+beijo coração	2
+coração cada	2
@@ -0,0 +1,16 @@
+Bigrama	Frequência
+nesse vídeo	62
+deixa comentários	45
+presidente bolsonaro	40
+todo mundo	29
+aplausos aplausos	27
+forças armadas	27
+vou mostrar	21
+olha vídeo	19
+ontem noite	16
+boa tarde	16
+vídeo deixa	15
+olhadinha nesse	14
+vídeo vai	14
+comentários achou	14
+vai agora	14
@@ -0,0 +1,16 @@
+Bigrama	Frequência
+presidente bolsonaro	2
+fala certeza	1
+certeza conhecem	1
+conhecem senadora	1
+senadora soraia	1
+soraia tronic	1
+tronic traiu	1
+traiu presidente	1
+bolsonaro candidata	1
+candidata presidenta	1
+presidenta contra	1
+contra próprio	1
+próprio presidente	1
+presidente mulherzinha	1
+mulherzinha mostrou	1