Skip to content

Maybe One‐Off commands

James Kent edited this page Mar 20, 2025 · 18 revisions

Find base_studies with the same doi

same_dois = BaseStudy.query.group_by(BaseStudy.doi).having(func.count(BaseStudy.doi) > 1).with_entities(BaseStudy.doi, func.array_agg(BaseStudy.id).label('id_list')).all()
from sqlalchemy import or_

# Collect all IDs to batch query
all_ids = [id_ for doi in same_dois if len(doi[1]) == 2 for id_ in doi[1]]
id_to_study = {s.id: s for s in BaseStudy.query.filter(BaseStudy.id.in_(all_ids)).all()}

keep = []
delete = []

for doi in same_dois:
    if len(doi[1]) != 2:
        continue

    first_id, second_id = doi[1]
    first = id_to_study[first_id]
    second = id_to_study[second_id]

    # Determine priority based on number of identifiers
    first_id_count = sum(bool(getattr(first, attr)) for attr in ["doi", "pmid", "pmcid"])
    second_id_count = sum(bool(getattr(second, attr)) for attr in ["doi", "pmid", "pmcid"])

    if second_id_count > first_id_count:
        first, second = second, first  # Swap so that first is always the higher priority study

    # Merge identifiers (priority study retains its values unless missing)
    first.pmid = first.pmid or second.pmid
    first.pmcid = first.pmcid or second.pmcid
    first.name = first.name or second.name
    first.description = first.description or second.description
    first.year = first.year or second.year
    first.publication = first.publication or second.publication
    first.metadata_ = first.metadata_ or second.metadata_

    # Merge versions/studies
    first.versions.extend(second.versions)

    # Add to lists for keeping and deleting
    keep.append(first)
    delete.append(second)

    # Optional logging
    print(f"doi: {doi[0]}")
    print(f"name: {first.name}")
    print(f"pmid: {first.pmid}")
    print(f"pmcid: {first.pmcid}")
    print(f"studies: {len(first.versions)} versions merged")
    print(f"first source: {first.versions[0].source if first.versions else 'No source'}")

# Optional: Delete duplicates after processing
#for study in delete:
#    db.session.delete(study)

# Commit the changes
# db.session.commit()

Divide combined studies

dup_bs = []
for bs in BaseStudy.query.options(joinedload(BaseStudy.versions)):
    if len(bs.versions) < 1:
        continue
    pmid = bs.versions[0].pmid
    for s in bs.versions[1:]:
        if not pmid:
            pmid = s.pmid
        if s.pmid and s.pmid != pmid:
            if bs not in dup_bs:
                dup_bs.append(bs)
            print(bs.id)
new_bs = []
for bs in dup_bs:
    orig_pmid = bs.pmid
    groups = {}
    for v in bs.versions:
        if v.pmid == orig_pmid:
            continue
        if v.pmid in groups:
            groups[v.pmid].append(v)
        else:
            groups[v.pmid] = [v]
    for pmid, studies in groups.items():
        new_bs.append(BaseStudy(
            name=next((x.name for x in studies if x.name), None),
            pmid=pmid,
            doi=next((x.doi for x in studies if x.doi), None),
            authors=next((x.authors for x in studies if x.authors), None),
            year=next((x.year for x in studies if x.year), None),
            description=next((x.description for x in studies if x.description), None),
            publication=next((x.publication for x in studies if x.publication), None),
            metadata_=next((x.metadata for x in studies if x.metadata_), None),
            level="group",
            public=True,
            versions=studies,
       ))

db.session.add_all(new_bs)
db.session.commit()

Update provenance

from sqlalchemy.orm.attributes import flag_modified


to_commit = []
for p in Project.query:
    if not p.provenance.get("extractionMetadata"):
        continue
    for ss in p.provenance['extractionMetadata']['studyStatusList']: 
        if ss['status'] == "COMPLETE":
            ss['status'] = 'completed'
        elif ss['status'] == 'SAVEFORLATER':
            ss['status'] = 'savedforlater'
   
    flag_modified(p, "provenance")
    to_commit.append(p)

db.session.add_all(to_commit)
db.session.commit()

Add order to existing analysis objects

from sqlalchemy.orm import joinedload

def order_objects(objects):
    # Check if all objects have table_id defined
    if all(hasattr(obj, 'table_id') and obj.table_id is not None for obj in objects):
        # Check if all table_ids can be represented as integers
        if all(obj.table_id.isdigit() for obj in objects):
            # Convert table_ids to integers and sort based on value
            objects.sort(key=lambda obj: int(obj.table_id))
        else:
            # Sort based on string comparison of table_ids
            objects.sort(key=lambda obj: obj.table_id)
    else:
        # Check if all names can be represented as integers
        if all(obj.name.isdigit() for obj in objects):
            # Convert names to integers and sort based on value
            objects.sort(key=lambda obj: int(obj.name))
        else:
            # Sort based on string comparison of names
            objects.sort(key=lambda obj: obj.name)
    
    # Assign order attribute to each object
    for i, obj in enumerate(objects):
        obj.order = i

studies = Study.query.options(joinedload(Study.analyses)).all()

handle studies with doi, but no pmid

doi_no_pmid = BaseStudy.query.filter(and_(BaseStudy.doi != None, BaseStudy.doi != '')).filter(or_(BaseStudy.pmid == None, BaseStudy.doi == '')).all()

to_commit = []
to_delete = []
attribute_names = [column.name for column in BaseStudy.__table__.columns]
for bs in doi_no_pmid:
    pmids = doi_to_pmid(bs.doi)
    if len(pmids) == 1:
        pmid = pmids[0]
        other_bs = BaseStudy.query.filter_by(doi=bs.doi, pmid=pmid).one_or_none()
        if other_bs:
            for attr in ['name', 'description', 'metadata', 'publication', 'authors', 'year', 'level']:
                setattr(other_bs, attr, getattr(other_bs, attr, getattr(bs, attr, None)))
            other_bs.versions.extend(bs.versions)
            to_delete.append(bs)
            bs = other_bs
        else:
            bs.pmid = pmid
        to_commit.append(bs)
        for v in bs.versions:
            v.pmid = v.pmid or pmid
            to_commit.append(v)
    print(bs.name)

Clone this wiki locally