Skip to content

Commit 13ecbcf

Browse files
No pid provider, adiciona deprecated_sps_pkg_name para identificar pacotes registrados e melhora a identificação e exclusão de duplicados (#1256)
* build: atualiza packtools para versão 4.14.0

* feat(pid_provider): adiciona sps_pkg_name e deprecated_sps_pkg_name na query
  - Adiciona cached_property sps_pkg_name e deprecated_sps_pkg_name
  - Expande busca por pkg_name para incluir todas as variantes

* feat(pid_provider): refatora deduplicação
  - Adiciona find_duplicated_v2 para buscar duplicatas por pid v2
  - Refatora deduplicate_items para suportar mark_as_duplicated e deduplicate
  - Renomeia fix_duplicated_pkg_name para fix_duplicated_items
  - fix_duplicated_items agora busca por pkg_name, v2 ou other_pid

* refactor(pid_provider): simplifica chamada de deduplicate_items na task
  - Unifica mark_as_duplicated e deduplicate em uma única chamada

* feat(article): adiciona deduplicação por pid_v2 e refatora métodos
  - Adiciona find_duplicated_pid_v2 para buscar duplicatas por pid v2
  - Refatora deduplicate_items para suportar mark_as_duplicated e deduplicate
  - Remove mark_items_as_duplicated (funcionalidade incorporada em deduplicate_items)
  - Renomeia fix_duplicated_pkg_name para fix_duplicated_items
  - find_duplicated_pkg_names retorna QuerySet com values_list

* refactor(article): simplifica chamada de deduplicate_items na task
  - Unifica mark_as_duplicated e deduplicate em uma única chamada

* Update article/models.py

  Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Corrige a expressão de busca

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 56c96a2 commit 13ecbcf

6 files changed

Lines changed: 129 additions & 63 deletions

File tree

article/models.py

Lines changed: 52 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -930,56 +930,76 @@ def find_duplicated_pkg_names(cls, journal=None, journal_id=None):
930930
params["journal"] = journal
931931
if journal_id:
932932
params["journal__id"] = journal_id
933-
duplicates = (
933+
return (
934934
cls.objects.filter(**params)
935935
.exclude(sps_pkg_name__isnull=True)
936936
.exclude(sps_pkg_name="")
937937
.exclude(data_status=choices.DATA_STATUS_DUPLICATED)
938938
.values("sps_pkg_name")
939939
.annotate(count=Count("id"))
940940
.filter(count__gt=1)
941+
.values_list("sps_pkg_name", flat=True)
942+
)
943+
944+
@classmethod
945+
def find_duplicated_pid_v2(cls, journal=None, journal_id=None):
946+
# Agrupa por pid_v2 os artigos não marcados como duplicados (opcionalmente filtrados por journal) e retorna os valores repetidos
947+
params = {}
948+
if journal:
949+
params["journal"] = journal
950+
if journal_id:
951+
params["journal__id"] = journal_id
952+
return (
953+
cls.objects.filter(**params)
954+
.exclude(pid_v2__isnull=True)
955+
.exclude(pid_v2="")
956+
.exclude(data_status=choices.DATA_STATUS_DUPLICATED)
957+
.values("pid_v2")
958+
.annotate(count=Count("id"))
959+
.filter(count__gt=1)
960+
.values_list("pid_v2", flat=True)
941961
)
942-
return list(item["sps_pkg_name"] for item in duplicates)
943962

944963
@classmethod
945-
def mark_items_as_duplicated(cls, journal=None, journal_id=None):
964+
def deduplicate_items(cls, user, journal=None, journal_id=None, mark_as_duplicated=False, deduplicate=False):
946965
"""
947966
Corrige todos os artigos marcados como DATA_STATUS_DUPLICATED com base nos ISSNs fornecidos.
948967
949968
Args:
950969
issns: Lista de ISSNs para verificar duplicatas.
951970
user: Usuário que está executando a operação.
952971
"""
953-
article_duplicated_pkg_names = cls.find_duplicated_pkg_names(
972+
article_duplicated_pid_v2 = cls.find_duplicated_pid_v2(
954973
journal, journal_id
955974
)
956-
if not article_duplicated_pkg_names:
957-
return
958-
cls.objects.filter(sps_pkg_name__in=article_duplicated_pkg_names).exclude(
959-
data_status=choices.DATA_STATUS_DUPLICATED
960-
).update(
961-
data_status=choices.DATA_STATUS_DUPLICATED,
962-
)
963-
return article_duplicated_pkg_names
964-
965-
@classmethod
966-
def deduplicate_items(cls, user, journal=None, journal_id=None):
967-
"""
968-
Corrige todos os artigos marcados como DATA_STATUS_DUPLICATED com base nos ISSNs fornecidos.
975+
if article_duplicated_pid_v2.exists():
976+
if mark_as_duplicated:
977+
cls.objects.filter(pid_v2__in=article_duplicated_pid_v2).exclude(
978+
data_status=choices.DATA_STATUS_DUPLICATED
979+
).update(
980+
data_status=choices.DATA_STATUS_DUPLICATED,
981+
)
982+
if deduplicate:
983+
for pid_v2 in article_duplicated_pid_v2:
984+
cls.fix_duplicated_items(user, None, pid_v2)
969985

970-
Args:
971-
issns: Lista de ISSNs para verificar duplicatas.
972-
user: Usuário que está executando a operação.
973-
"""
974986
article_duplicated_pkg_names = cls.find_duplicated_pkg_names(
975987
journal, journal_id
976988
)
977-
for pkg_name in article_duplicated_pkg_names:
978-
cls.fix_duplicated_pkg_name(pkg_name, user)
989+
if article_duplicated_pkg_names.exists():
990+
if mark_as_duplicated:
991+
cls.objects.filter(sps_pkg_name__in=article_duplicated_pkg_names).exclude(
992+
data_status=choices.DATA_STATUS_DUPLICATED
993+
).update(
994+
data_status=choices.DATA_STATUS_DUPLICATED,
995+
)
996+
if deduplicate:
997+
for pkg_name in article_duplicated_pkg_names:
998+
cls.fix_duplicated_items(user, pkg_name, None)
979999
return article_duplicated_pkg_names
9801000

9811001
@classmethod
982-
def fix_duplicated_pkg_name(cls, pkg_name, user):
1002+
def fix_duplicated_items(cls, user, pkg_name, pid_v2):
9831003
"""
9841004
Corrige artigos marcados como DATA_STATUS_DUPLICATED com base no pkg_name fornecido.
9851005
@@ -991,7 +1011,12 @@ def fix_duplicated_pkg_name(cls, pkg_name, user):
9911011
int: Número de artigos atualizados.
9921012
"""
9931013
try:
994-
articles = cls.objects.filter(sps_pkg_name=pkg_name).exclude(
1014+
filters = Q()
1015+
if pkg_name:
1016+
filters |= Q(sps_pkg_name=pkg_name)
1017+
if pid_v2:
1018+
filters |= Q(pid_v2=pid_v2)
1019+
articles = cls.objects.filter(filters).exclude(
9951020
data_status=choices.DATA_STATUS_DUPLICATED
9961021
)
9971022
if articles.count() <= 1:
@@ -1014,8 +1039,8 @@ def fix_duplicated_pkg_name(cls, pkg_name, user):
10141039
UnexpectedEvent.create(
10151040
exception=exception,
10161041
exc_traceback=exc_traceback,
1017-
action="article.models.Article.fix_duplicated_pkg_name",
1018-
detail=pkg_name,
1042+
action="article.models.Article.fix_duplicated_items",
1043+
detail=pkg_name or pid_v2,
10191044
)
10201045

10211046

article/tasks.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1063,11 +1063,8 @@ def task_fix_journal_articles_status(
10631063
if mark_as_public:
10641064
Article.mark_items_as_public(journal_id=journal_id)
10651065

1066-
if mark_as_duplicated:
1067-
Article.mark_items_as_duplicated(journal_id=journal_id)
1068-
1069-
if deduplicate:
1070-
Article.deduplicate_items(user, journal_id=journal_id)
1066+
if mark_as_duplicated or deduplicate:
1067+
Article.deduplicate_items(user, journal_id=journal_id, mark_as_duplicated=mark_as_duplicated, deduplicate=deduplicate)
10711068

10721069
return {
10731070
"status": "success",

pid_provider/models.py

Lines changed: 52 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1315,7 +1315,7 @@ def mark_items_as_invalid(cls, issns):
13151315
@profile_classmethod
13161316
def find_duplicated_pkg_names(cls, issns):
13171317
# Busca em ambos os campos de ISSN
1318-
duplicates = (
1318+
return (
13191319
cls.objects.filter(Q(issn_print__in=issns) | Q(issn_electronic__in=issns))
13201320
.exclude(pkg_name__isnull=True)
13211321
.exclude(pkg_name="")
@@ -1328,52 +1328,82 @@ def find_duplicated_pkg_names(cls, issns):
13281328
.values("pkg_name")
13291329
.annotate(count=Count("id"))
13301330
.filter(count__gt=1)
1331+
.values_list("pkg_name", flat=True)
13311332
)
1332-
return list(set(item["pkg_name"] for item in duplicates))
1333-
1334-
@classmethod
1333+
13351334
@profile_classmethod
1336-
def mark_items_as_duplicated(cls, issns):
1337-
ppx_duplicated_pkg_names = PidProviderXML.find_duplicated_pkg_names(issns)
1338-
if not ppx_duplicated_pkg_names:
1339-
return
1340-
cls.objects.filter(pkg_name__in=ppx_duplicated_pkg_names).exclude(
1341-
proc_status=choices.PPXML_STATUS_DUPLICATED
1342-
).update(
1343-
proc_status=choices.PPXML_STATUS_DUPLICATED,
1335+
def find_duplicated_v2(cls, issns):
1336+
# Busca em ambos os campos de ISSN
1337+
return (
1338+
cls.objects.filter(Q(issn_print__in=issns) | Q(issn_electronic__in=issns))
1339+
.exclude(v2__isnull=True)
1340+
.exclude(v2="")
1341+
.exclude(
1342+
proc_status__in=[
1343+
choices.PPXML_STATUS_DUPLICATED,
1344+
choices.PPXML_STATUS_INVALID,
1345+
]
1346+
)
1347+
.values("v2")
1348+
.annotate(count=Count("id"))
1349+
.filter(count__gt=1)
1350+
.values_list("v2", flat=True)
13441351
)
1345-
return ppx_duplicated_pkg_names
13461352

13471353
@classmethod
13481354
@profile_classmethod
1349-
def deduplicate_items(cls, user, issns):
1355+
def deduplicate_items(cls, user, issns, mark_as_duplicated=False, deduplicate=False):
13501356
"""
13511357
Corrige todos os artigos marcados como DATA_STATUS_DUPLICATED com base nos ISSNs fornecidos.
13521358
13531359
Args:
13541360
issns: Lista de ISSNs para verificar duplicatas.
13551361
user: Usuário que está executando a operação.
13561362
"""
1363+
duplicated_v2 = cls.find_duplicated_v2(issns)
1364+
if duplicated_v2.exists():
1365+
if mark_as_duplicated:
1366+
cls.objects.filter(v2__in=duplicated_v2).exclude(
1367+
proc_status=choices.PPXML_STATUS_DUPLICATED
1368+
).update(
1369+
proc_status=choices.PPXML_STATUS_DUPLICATED,
1370+
)
1371+
if deduplicate:
1372+
for v2 in duplicated_v2:
1373+
cls.fix_duplicated_items(user, None, v2)
1374+
13571375
duplicated_pkg_names = cls.find_duplicated_pkg_names(issns)
1358-
for pkg_name in duplicated_pkg_names:
1359-
cls.fix_duplicated_pkg_name(pkg_name, user)
1360-
return duplicated_pkg_names
1376+
if duplicated_pkg_names.exists():
1377+
if mark_as_duplicated:
1378+
cls.objects.filter(pkg_name__in=duplicated_pkg_names).exclude(
1379+
proc_status=choices.PPXML_STATUS_DUPLICATED
1380+
).update(
1381+
proc_status=choices.PPXML_STATUS_DUPLICATED,
1382+
)
1383+
if deduplicate:
1384+
for pkg_name in duplicated_pkg_names:
1385+
cls.fix_duplicated_items(user, pkg_name, None)
13611386

13621387
@classmethod
13631388
@profile_classmethod
1364-
def fix_duplicated_pkg_name(cls, pkg_name, user):
1389+
def fix_duplicated_items(cls, user, pkg_name, v2):
13651390
"""
13661391
Corrige items marcados como PPXML_STATUS_DUPLICATED com base no pkg_name fornecido.
13671392
13681393
Args:
1369-
pkg_name: Nome do pacote para verificar duplicatas.
13701394
user: Usuário que está executando a operação.
1371-
1395+
pkg_name: Nome do pacote para verificar duplicatas.
1396+
v2: Valor do pid v2 para verificar duplicatas.
13721397
Returns:
13731398
int: Número de items atualizados.
13741399
"""
13751400
try:
1376-
items = cls.objects.filter(pkg_name=pkg_name)
1401+
filters = Q()
1402+
if v2:
1403+
filters |= Q(v2=v2) | Q(other_pid__pid_in_xml=v2)
1404+
if pkg_name:
1405+
filters |= Q(pkg_name=pkg_name)
1406+
items = cls.objects.filter(filters)
13771407
if items.count() <= 1:
13781408
return 0
13791409

@@ -1409,7 +1439,7 @@ def fix_duplicated_pkg_name(cls, pkg_name, user):
14091439
UnexpectedEvent.create(
14101440
exception=exception,
14111441
exc_traceback=exc_traceback,
1412-
action="pid_provider.models.PidProviderXML.fix_duplicated_pkg_name",
1442+
action="pid_provider.models.PidProviderXML.fix_duplicated_items",
14131443
detail=pkg_name,
14141444
)
14151445

pid_provider/query_params.py

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,19 @@ def aop_pid(self):
6363

6464
@cached_property
6565
def pkg_name(self):
66-
"""Nome do pacote do documento."""
66+
"""Nome do pacote do documento, parâmetro usado ao instanciar XMLAdapter"""
6767
return self.xml_adapter.pkg_name
68-
68+
69+
@cached_property
70+
def sps_pkg_name(self):
71+
"""Nome do pacote do documento (deprecated)."""
72+
return self.xml_adapter.sps_pkg_name
73+
74+
@cached_property
75+
def deprecated_sps_pkg_name(self):
76+
"""Nome do pacote do documento (deprecated)."""
77+
return self.xml_adapter.sps_pkg_name
78+
6979
@cached_property
7080
def main_doi(self):
7181
"""DOI principal do documento."""
@@ -176,8 +186,15 @@ def identifier_queries(self):
176186
q |= Q(v2=self.aop_pid) | Q(aop_pid=self.aop_pid)
177187

178188
# Package name
189+
pkg_names = set()
179190
if self.pkg_name:
180-
q |= Q(pkg_name=self.pkg_name)
191+
pkg_names.add(self.pkg_name)
192+
if self.sps_pkg_name:
193+
pkg_names.add(self.sps_pkg_name)
194+
if self.deprecated_sps_pkg_name:
195+
pkg_names.add(self.deprecated_sps_pkg_name)
196+
if pkg_names:
197+
q |= Q(pkg_name__in=pkg_names)
181198

182199
# # DOI principal
183200
# if self.main_doi:

pid_provider/tasks.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -281,11 +281,8 @@ def task_fix_journal_pid_provider_xmls_status(
281281
if mark_as_invalid:
282282
PidProviderXML.mark_items_as_invalid(journal.issns)
283283

284-
if mark_as_duplicated:
285-
PidProviderXML.mark_items_as_duplicated(journal.issns)
286-
287-
if deduplicate:
288-
PidProviderXML.deduplicate_items(user, journal.issns)
284+
if mark_as_duplicated or deduplicate:
285+
PidProviderXML.deduplicate_items(user, journal.issns, mark_as_duplicated=mark_as_duplicated, deduplicate=deduplicate)
289286

290287
return {
291288
"status": "success",

requirements/base.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ pysolr==3.9.0 # https://pypi.org/project/pysolr/
8787
# ------------------------------------------------------------------------------
8888
tornado>=6.5.2 # not directly required, pinned by Snyk to avoid a vulnerability
8989
lxml==6.0.2 # https://github.com/lxml/lxml
90-
git+https://git@github.com/scieloorg/packtools@4.13.1#egg=packtools
90+
git+https://git@github.com/scieloorg/packtools@4.14.0#egg=packtools
9191

9292
# pymongo
9393
# ------------------------------------------------------------------------------

0 commit comments

Comments
 (0)