Raise on invalid search queries; use pubmed_to_tsquery; update tests

jdkent · jdkent · commit b0718af7cc1c · 2024-11-07T17:03:31.000-06:00
diff --git a/store/neurostore/resources/base.py b/store/neurostore/resources/base.py
@@ -8,6 +8,8 @@
 from flask import abort, request, current_app  # jsonify
 from flask.views import MethodView
 
+from psycopg2 import errors
+
 import sqlalchemy as sa
 import sqlalchemy.sql.expression as sae
 from sqlalchemy.orm import (
@@ -21,7 +23,7 @@
 
 from ..core import cache
 from ..database import db
-from .utils import get_current_user, validate_search_query, search_to_tsquery
+from .utils import get_current_user, validate_search_query, pubmed_to_tsquery
 from ..models import (
     StudysetStudy,
     AnnotationAnalysis,
@@ -613,10 +615,11 @@ def search(self):
         if s is not None and s.isdigit():
             q = q.filter_by(pmid=s)
         elif s is not None and self._fulltext_fields:
-            valid = validate_search_query(s)
-            if not valid:
-                abort(400, description=valid)
-            tsquery = search_to_tsquery(s)
+            try:
+                validate_search_query(s)
+            except errors.SyntaxError as e:
+                abort(400, description=e.args[0])
+            tsquery = pubmed_to_tsquery(s)
             q = q.filter(m._ts_vector.op("@@")(tsquery))
 
         # Alternatively (or in addition), search on individual fields.
diff --git a/store/neurostore/resources/utils.py b/store/neurostore/resources/utils.py
@@ -5,6 +5,7 @@
 import re
 
 from connexion.context import context
+from psycopg2 import errors
 
 from .. import models
 from .. import schemas
@@ -58,11 +59,11 @@ def validate_search_query(query: str) -> bool:
     """
     # Check for valid parentheses
     if not validate_parentheses(query):
-        return 'Unmatched parentheses'
+        raise errors.SyntaxError("Unmatched parentheses")
 
     # Check for valid query end
     if not validate_query_end(query):
-        return 'Query cannot end with an operator'
+        raise errors.SyntaxError("Query cannot end with an operator")
 
     return True
 
@@ -79,27 +80,27 @@ def validate_parentheses(query: str) -> bool:
     """
     stack = []
     for char in query:
-        if char == '(':
+        if char == "(":
             stack.append(char)
-        elif char == ')':
+        elif char == ")":
             if not stack:
                 return False  # Unmatched closing parenthesis
             stack.pop()
     return not stack  # Ensure all opening parentheses are closed
 
 
 def validate_query_end(query: str) -> bool:
-    """ Query should not end with an operator """
-    operators = ('AND', 'OR', 'NOT')
+    """Query should not end with an operator"""
+    operators = ("AND", "OR", "NOT")
 
-    if query.strip().split(' ')[-1] in operators:
+    if query.strip().split(" ")[-1] in operators:
         return False
     return True
 
 
 def count_chars(target, query: str) -> int:
-    """ Count the number of chars in a query string.
-     Excluding those in quoted phrases."""
+    """Count occurrences of ``target`` in a query string,
+    excluding those inside quoted phrases."""
     count = 0
     in_quotes = False
     for char in query:
@@ -112,11 +113,11 @@ def count_chars(target, query: str) -> int:
 
 def pubmed_to_tsquery(query: str) -> str:
     """
-    Convert a PubMed-like search query to PostgreSQL tsquery format, 
-    grouping both single-quoted and double-quoted text with the <-> operator 
+    Convert a PubMed-like search query to PostgreSQL tsquery format,
+    grouping both single-quoted and double-quoted text with the <-> operator
     for proximity search.
 
-    Additionally, automatically adds & between non-explicitly connected terms 
+    Additionally, automatically adds & between non-explicitly connected terms
     and handles NOT terms.
 
     Args:
@@ -130,7 +131,7 @@ def pubmed_to_tsquery(query: str) -> str:
 
     # Step 1: Split into tokens (preserving quoted phrases)
     # Regex pattern: match quoted phrases or non-space sequences
-    tokens = re.findall( r'"[^"]*"|\'[^\']*\'|\S+', query)
+    tokens = re.findall(r'"[^"]*"|\'[^\']*\'|\S+', query)
 
     # Step 2: Combine tokens in parantheses into single tokens
     def combine_parentheses(tokens: list) -> list:
@@ -152,19 +153,19 @@ def combine_parentheses(tokens: list) -> list:
                 buffer.append(token)
 
                 # Adjust the count of parentheses
-                paren_count += count_chars('(', token) - count_chars(')', token)
+                paren_count += count_chars("(", token) - count_chars(")", token)
 
                 if paren_count < 1:
                     # Combine all tokens in parentheses
-                    combined_tokens.append(' '.join(buffer))
+                    combined_tokens.append(" ".join(buffer))
                     buffer = []  # Clear the buffer
                     paren_count = 0
 
             else:
-                n_paren = count_chars('(', token) - count_chars(')', token)
+                n_paren = count_chars("(", token) - count_chars(")", token)
                 # If not in parentheses, but token contains opening parentheses
                 # Start capturing tokens inside parentheses
-                if token[0] == '(' and n_paren > 0:
+                if token[0] == "(" and n_paren > 0:
                     paren_count += n_paren
                     buffer.append(token)  # Start capturing tokens in parens
                     print(buffer)
@@ -174,7 +175,7 @@ def combine_parentheses(tokens: list) -> list:
         # If the list ends without a closing parenthesis (invalid input)
         # append buffer contents (fallback)
         if buffer:
-            combined_tokens.append(' '.join(buffer))
+            combined_tokens.append(" ".join(buffer))
 
         return combined_tokens
 
@@ -184,42 +185,43 @@ def combine_parentheses(tokens: list) -> list:
         if token[0] == "(" and token[-1] == ")":
             # RECURSIVE: Process the contents of the parentheses
             token_res = pubmed_to_tsquery(token[1:-1])
-            token = '(' + token_res + ')'
+            token = "(" + token_res + ")"
             tokens[i] = token
 
         # Step 4: Handle both single-quoted and double-quoted phrases,
         # grouping them with <-> (proximity operator)
         elif token[0] in ('"', "'"):
             # Split quoted text into individual words and join with <-> for
             # proximity search
-            words = re.findall(r'\w+', token)
-            tokens[i] = '<->'.join(words)
+            words = re.findall(r"\w+", token)
+            tokens[i] = "<->".join(words)
 
         # Step 3: Replace logical operators AND, OR, NOT
         else:
-            if token == 'AND':
-                tokens[i] = '&'
-            elif token == 'OR':
-                tokens[i] = '|'
-            elif token == 'NOT':
-                tokens[i] = '&!'
+            if token == "AND":
+                tokens[i] = "&"
+            elif token == "OR":
+                tokens[i] = "|"
+            elif token == "NOT":
+                tokens[i] = "&!"
 
     processed_tokens = []
     last_token = None
     for token in tokens:
         # Step 5: Add & between consecutive terms that aren't already
         # connected by an operator
         stripped_token = token.strip()
-
-        if stripped_token == '':
+        if stripped_token not in ("&", "|", "!", "&!"):
+            stripped_token = re.sub(r"[\[\],;:!?@#]", "", stripped_token)
+        if stripped_token == "":
             continue  # Ignore empty tokens from splitting
 
-        if last_token and last_token not in ('&', '|', '!', '&!'):
-            if stripped_token not in ('&', '|', '!', '&!'):
+        if last_token and last_token not in ("&", "|", "!", "&!"):
+            if stripped_token not in ("&", "|", "!", "&!"):
                 # Insert an implicit AND (&) between two non-operator tokens
-                processed_tokens.append('&')
+                processed_tokens.append("&")
 
         processed_tokens.append(stripped_token)
         last_token = stripped_token
 
-    return ' '.join(processed_tokens)
+    return " ".join(processed_tokens)
diff --git a/store/neurostore/tests/api/test_query_params.py b/store/neurostore/tests/api/test_query_params.py
@@ -1,6 +1,7 @@
 import pytest
 from ...models import Study
 from ...schemas.data import StudysetSchema, StudySchema, AnalysisSchema, StringOrNested
+from ..conftest import valid_queries, invalid_queries
 
 
 @pytest.mark.parametrize("nested", ["true", "false"])
@@ -99,3 +100,17 @@ def test_multiword_queries(auth_client, ingest_neurosynth, session):
 
     multi_word_search = auth_client.get(f"/api/studies/?search={multiple_words}")
     assert multi_word_search.status_code == 200
+
+
+@pytest.mark.parametrize("query, expected", valid_queries)
+def test_valid_pubmed_queries(query, expected, auth_client, ingest_neurosynth, session):
+    search = auth_client.get(f"/api/studies/?search={query}")
+    assert search.status_code == 200
+
+
+@pytest.mark.parametrize("query, expected", invalid_queries)
+def test_invalid_pubmed_queries(
+    query, expected, auth_client, ingest_neurosynth, session
+):
+    search = auth_client.get(f"/api/studies/?search={query}")
+    assert search.status_code == 400
diff --git a/store/neurostore/tests/conftest.py b/store/neurostore/tests/conftest.py
@@ -586,3 +586,59 @@ def simple_neurosynth_annotation(session, ingest_neurosynth):
     session.commit()
 
     return smol_annot
+
+
+"""
+Queries for testing
+"""
+invalid_queries = [
+    (
+        '("autism" OR "ASD" OR "autistic") AND (("decision*" OR "choice*" ',
+        "Unmatched parentheses",
+    ),
+    ('"autism" OR "ASD" OR "autistic" OR ', "Query cannot end with an operator"),
+    (
+        '(("Autism Spectrum Disorder" OR "autism spectrum disorder") OR ("Autism" OR "autism") OR ("ASD")) AND (("decision*" OR "Dec',
+        "Unmatched parentheses",
+    ),
+]
+
+valid_queries = [
+    (
+        '"Mild Cognitive Impairment" or "Early Cognitive Decline" or "Pre-Dementia" or "Mild Neurocognitive Disorder"',
+        "MILD<->COGNITIVE<->IMPAIRMENT | EARLY<->COGNITIVE<->DECLINE | PRE<->DEMENTIA | MILD<->NEUROCOGNITIVE<->DISORDER",
+    ),
+    (
+        '("autism" OR "ASD" OR "autistic") AND ("decision" OR "choice")',
+        "(AUTISM | ASD | AUTISTIC) & (DECISION | CHOICE)",
+    ),
+    (
+        "stroop and depression or back and depression or go",
+        "STROOP & DEPRESSION | BACK & DEPRESSION | GO",
+    ),
+    (
+        '("autism" OR "ASD" OR "autistic") AND (("decision" OR "decision-making" OR "choice" OR "selection" OR "option" OR "value") OR ("feedback" OR "feedback-related" OR "reward" OR "error" OR "outcome" OR "punishment" OR "reinforcement"))',
+        "(AUTISM | ASD | AUTISTIC) & ((DECISION | DECISION<->MAKING | CHOICE | SELECTION | OPTION | VALUE) | (FEEDBACK | FEEDBACK<->RELATED | REWARD | ERROR | OUTCOME | PUNISHMENT | REINFORCEMENT))",
+    ),
+    (
+        '"dyslexia" or "Reading Disorder" or "Language-Based Learning Disability" or "Phonological Processing Disorder" or "Word Blindness"',
+        "DYSLEXIA | READING<->DISORDER | LANGUAGE<->BASED<->LEARNING<->DISABILITY | PHONOLOGICAL<->PROCESSING<->DISORDER | WORD<->BLINDNESS",
+    ),
+    ("emotion and pain -physical -touch", "EMOTION & PAIN & -PHYSICAL & -TOUCH"),
+    (
+        '("Schizophrenia"[Mesh] OR schizophrenia )',
+        "(SCHIZOPHRENIA & MESH | SCHIZOPHRENIA)",
+    ),
+    ("Bipolar Disorder", "BIPOLAR & DISORDER"),
+    ('"quchi" or "LI11"', "QUCHI | LI11"),
+    ('"rubber hand illusion"', "RUBBER<->HAND<->ILLUSION"),
+]
+
+weird_queries = [
+    (
+        "[Major Depressive Disorder (MDD)] or [Clinical Depression] or [Unipolar Depression]",
+        "MAJOR & DEPRESSIVE & DISORDER & (MDD) | CLINICAL & DEPRESSION | UNIPOLAR & DEPRESSION",
+    ),
+]
+
+validate_queries = invalid_queries + [(q, True) for q, _ in valid_queries]
diff --git a/store/neurostore/tests/test_utils.py b/store/neurostore/tests/test_utils.py
@@ -1,54 +1,23 @@
 import pytest
 
-from ..utils import search_to_tsquery, validate_search_query
-
-
-invalid_queries = [
-    ('("autism" OR "ASD" OR "autistic") AND (("decision*" OR "choice*" ', 'Unmatched parentheses'),
-    ('"autism" OR "ASD" OR "autistic" OR ', 'Query cannot end with an operator'),
-    ('(("Autism Spectrum Disorder" OR "autism spectrum disorder") OR ("Autism" OR "autism") OR ("ASD")) AND (("decision*" OR "Dec', 'Unmatched parentheses')
-]
-
-valid_queries = [
-    ('"Mild Cognitive Impairment" or "Early Cognitive Decline" or "Pre-Dementia" or "Mild Neurocognitive Disorder"', 
-     'MILD<->COGNITIVE<->IMPAIRMENT | EARLY<->COGNITIVE<->DECLINE | PRE<->DEMENTIA | MILD<->NEUROCOGNITIVE<->DISORDER'),
-    ('("autism" OR "ASD" OR "autistic") AND ("decision" OR "choice")',
-     '(AUTISM | ASD | AUTISTIC) & (DECISION | CHOICE)'),
-    ('stroop and depression or back and depression or go',
-     'STROOP & DEPRESSION | BACK & DEPRESSION | GO'),
-    ('("autism" OR "ASD" OR "autistic") AND (("decision" OR "decision-making" OR "choice" OR "selection" OR "option" OR "value") OR ("feedback" OR "feedback-related" OR "reward" OR "error" OR "outcome" OR "punishment" OR "reinforcement"))',
-     '(AUTISM | ASD | AUTISTIC) & ((DECISION | DECISION<->MAKING | CHOICE | SELECTION | OPTION | VALUE) | (FEEDBACK | FEEDBACK<->RELATED | REWARD | ERROR | OUTCOME | PUNISHMENT | REINFORCEMENT))'),
-    ('"dyslexia" or "Reading Disorder" or "Language-Based Learning Disability" or "Phonological Processing Disorder" or "Word Blindness"',
-     'DYSLEXIA | READING<->DISORDER | LANGUAGE<->BASED<->LEARNING<->DISABILITY | PHONOLOGICAL<->PROCESSING<->DISORDER | WORD<->BLINDNESS'),
-    ('emotion and pain -physical -touch',
-     'EMOTION & PAIN & -PHYSICAL & -TOUCH'),
-    ('("Schizophrenia"[Mesh] OR schizophrenia )',
-     '(SCHIZOPHRENIA & [MESH] | SCHIZOPHRENIA)')
-    ('Bipolar Disorder',
-     'BIPOLAR & DISORDER'),
-    ('"quchi" or "LI11"',
-     'QUCHI | LI11'),
-    ('"rubber hand illusion"',
-     'RUBBER<->HAND<->ILLUSION'),
-]
-
-error_queries = [
-    "[Major Depressive Disorder (MDD)] or [Clinical Depression] or [Unipolar Depression]"
-]
-
-validate_queries = invalid_queries + [(q, True) for q, _ in valid_queries]
+from ..resources.utils import pubmed_to_tsquery, validate_search_query
+from .conftest import valid_queries, validate_queries, weird_queries
 
 
 @pytest.mark.parametrize("query, expected", valid_queries)
-def test_search_to_tsquery(query, expected):
-    assert search_to_tsquery(query) == expected
+def test_pubmed_to_tsquery(query, expected):
+    assert pubmed_to_tsquery(query) == expected
 
 
-@pytest.mark.parametrize("query, expected", invalid_queries)
+@pytest.mark.parametrize("query, expected", validate_queries)
 def test_validate_search_query(query, expected):
-    assert validate_search_query(query) == expected
+    if expected is True:
+        assert validate_search_query(query) == expected
+    else:
+        with pytest.raises(Exception):
+            validate_search_query(query)
+
 
-@pytest.mark.parametrize("query", error_queries)
-def test_search_to_tsquery_error(query):
-    with pytest.raises(ValueError):
-        search_to_tsquery(query)
+@pytest.mark.parametrize("query, expected", weird_queries)
+def test_pubmed_to_tsquery_weird(query, expected):
+    assert pubmed_to_tsquery(query) == expected