-
Notifications
You must be signed in to change notification settings - Fork 2
/
process_xml.py
1173 lines (1012 loc) · 42.7 KB
/
process_xml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# Pam April 2019
# Parsing jats DTD, the standard for Europe PMC
# see https://jats.nlm.nih.gov/
import codecs
import glob
import json
import os
import re
import sys
from collections import Counter
from datetime import datetime
from optparse import OptionParser

from lxml import etree
def get_file_content(name):
    """Return the full text content of the file *name*.

    Uses a context manager so the handle is closed even if read() raises
    (the original left the file open on error).
    NOTE(review): keeps the platform default encoding, as the original
    did -- confirm whether utf-8 should be forced for JATS input.
    """
    with open(name, 'r') as f:
        return f.read()
def cleanup_input_xml(xmlstr):
    """Trim and normalize a raw PMC XML string before lxml parsing.

    - keeps only the span from the first '<article' to the last
      '</article>' (drops XML declaration, DTD, trailing junk)
    - strips any default namespace declaration so simple xpaths keep working
    - pads '-->' with a space so comment ends don't glue tokens together

    Raises ValueError (from str.index) if no <article> element is present.
    """
    start = xmlstr.index("<article")
    end = xmlstr.rindex("</article>") + len("</article>")
    xmlstr = xmlstr[start:end]
    # raw string: \S is a regex escape, not a Python string escape
    # (the original non-raw literal triggered an invalid-escape warning)
    xmlstr = re.sub(r'xmlns="\S*?"', '', xmlstr)
    return xmlstr.replace("-->", "--> ")
# helper function, used for stats printing
def get_cardinality(n):
    # Collapse a count to '0', '1' or 'N' (anything above one).
    return 'N' if n > 1 else str(n)
# helper function, used for stats printing
def get_el_cardinality(someroot, somepath):
    # '<xpath>:<0|1|N>' summary of how many nodes match somepath.
    count = len(someroot.xpath(somepath))
    return '{}:{}'.format(somepath, get_cardinality(count))
# helper function, used for stats printing
def get_stats(fname, someroot):
    # One TSV stats line: marker, file name, cardinality of key elements.
    fields = [
        'pam-stats',
        fname,
        get_el_cardinality(someroot, '/article/front/article-meta/abstract'),
        get_el_cardinality(someroot, '/article/body/p'),
        get_el_cardinality(someroot, '/article/body/sec'),
    ]
    return '\t'.join(fields)
# helper function, used for stats printing
# lists the tags of elements containing a <fig> for a given file
def get_fig_parents(fname, someroot):
    """Return one stats line per distinct parent tag of <fig> elements.

    Each line: 'fig-stats<TAB><fname><TAB><parent-tag>:<count>'.
    xpath() always returns a list (possibly empty), so the original's
    None check was dead code; Counter replaces the manual tally.
    """
    counts = Counter(fig.getparent().tag
                     for fig in someroot.xpath('/article/body//fig'))
    return ['fig-stats' + '\t' + fname + '\t<' + tag + '>:' + str(n)
            for tag, n in counts.items()]
# helper function, used for stats printing
# lists the tags of elements containing a <table-wrap> for a given file
def get_tw_parents(fname, someroot):
    """Return one stats line per distinct parent tag of <table-wrap> elements.

    Each line: 'tw-stats<TAB><fname><TAB><parent-tag>:<count>'.
    xpath() always returns a list (possibly empty), so the original's
    None check was dead code; Counter replaces the manual tally.
    """
    counts = Counter(tw.getparent().tag
                     for tw in someroot.xpath('/article/body//table-wrap'))
    return ['tw-stats' + '\t' + fname + '\t<' + tag + '>:' + str(n)
            for tag, n in counts.items()]
# helper function, used for stats printing
# lists the tags of elements that are direct children of <body>
def get_body_structure(fname, someroot):
    # One TSV line: marker, file name, article-type, then the tags of all
    # direct non-comment children of <body> (comma-terminated).
    article = someroot.xpath('/article')[0]
    line = 'pam-struc' + '\t' + fname + '\t' + article.get('article-type') + '\t'
    bodies = someroot.xpath('/article/body')
    if len(bodies) > 0:
        for child in bodies[0].iterchildren():
            if isinstance(child, etree._Comment):
                continue
            line += child.tag + ','
    return line
def get_keywords(someroot):
    # Collect the cleaned text of every <kwd> element in the article.
    kwd_list = someroot.xpath('/article//kwd')
    if kwd_list is None:
        return []
    return [clean_string(' '.join(kwd.itertext())) for kwd in kwd_list]
def get_multiple_texts_from_xpath(someroot, somepath, withErrorOnNoValue):
    # Concatenate the text of every node matching somepath (cleaned).
    # Optionally records an error in the global file status when nothing
    # is found.
    pieces = [' '.join(node.itertext()) for node in someroot.xpath(somepath)]
    result = ''.join(pieces)
    if len(result) >= 1:
        return clean_string(result)
    if withErrorOnNoValue:
        file_status_add_error("ERROR, no text for element: " + somepath)
    return result
def get_text_from_xpath(someroot, somepath, withWarningOnMultipleValues, withErrorOnNoValue):
    # Cleaned text of the first node matching somepath ('' when absent).
    # Optionally records a warning on multiple matches / an error on none.
    matches = someroot.xpath(somepath)
    if len(matches) == 0:
        if withErrorOnNoValue is True:
            file_status_add_error("ERROR, no text for element: " + somepath)
        return ''
    if len(matches) > 1 and withWarningOnMultipleValues is True:
        file_status_add_error('WARNING: multiple elements found: ' + somepath)
    return get_clean_text(matches[0])
def get_pub_date_by_type(someroot, selector, pubtype, format):
    """Extract a publication date matching *selector*, optionally filtered
    by @pub-type, and format it.

    Returns {'date': str or None,
             'status': 'ok' | 'not found' | 'incomplete' | 'unparseable'}.
    A missing year aborts with status 'incomplete' and date None.
    Missing month/day default to '01' and downgrade status to 'incomplete'.
    A non-numeric or out-of-range month yields status 'unparseable'.
    *format* may be 'yyyy', 'd-M-yyyy', or anything else for 'yyyy MMM dd'.
    """
    if not pubtype is None: selector += '[@pub-type="' + pubtype + '"]'
    dates = someroot.xpath(selector);
    if len(dates)==0: return {'date': None, 'status':'not found'}
    # only the first matching <pub-date> element is considered
    dt = dates[0]
    status = 'ok'
    ynode = dt.find('year')
    year = ynode.text if ynode is not None and ynode.text is not None else ''
    year = year.strip()
    # a date without a year is unusable
    if len(year)==0: return {'date': None, 'status': 'incomplete'}
    mnode = dt.find('month')
    mm = '01'
    if mnode is not None and mnode.text is not None:
        mm = mnode.text
        mm = mm.strip()
        # zero-pad single-digit months
        if len(mm)==1: mm="0" + mm
    else:
        status = 'incomplete'
    mmm_names=['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
    mmm = ''
    # month abbreviation, used only by the default 'yyyy MMM dd' format
    if mm.isdigit() and int(mm)>0 and int(mm)<=12:
        mmm = mmm_names[int(mm)-1]
    else:
        # NOTE(review): a textual month (e.g. 'Dec') lands here, and the
        # default format is then built with an empty MMM -- confirm intended
        status = 'unparseable'
    dnode = dt.find('day')
    day = '01'
    if dnode is not None and dnode.text is not None:
        day = dnode.text
        day = day.strip()
        # zero-pad single-digit days
        if len(day)==1: day = "0" + day
    else:
        status = 'incomplete'
    formatted_date = year + ' ' + mmm + ' ' + day # default format
    if format=='yyyy': formatted_date = year
    if format=='d-M-yyyy': formatted_date = day + '-' + mm + '-' + year
    return {'date': formatted_date, 'status': status}
# easiest way to retrieve publication date: take first in the list
def get_first_pub_date(someroot, format):
    # Take whatever <pub-date> comes first, regardless of its pub-type.
    return get_pub_date_by_type(
        someroot, '/article/front/article-meta/pub-date', None, format)
# alternative way to retrieve publication date: use precedence by pub-type
# precedence order: pmc-release, epub, ppub.
# we assume pmc-release and epub are complete dates (with month and day)
# we then try ppub and add month = 1 and day = 1 if they are missing
# algo decided in accordance with Julien
def get_pub_date(someroot, format):
    # Try pub-types in precedence order; last resort is any <pub-date>.
    # When even that fails, log an error and return the failed lookup.
    selector = '/article/front/article-meta/pub-date'
    for pubtype in ('epub', 'ppub', 'collection', None):
        dt = get_pub_date_by_type(someroot, selector, pubtype, format)
        if dt['status'] == 'ok':
            return dt
    file_status_add_error('ERROR, element not found: ' + selector)
    return dt
def get_pmc_release_date(someroot, format):
    # The pmc-release date is optional: its absence is not an error.
    dt = get_pub_date_by_type(
        someroot, '/article/front/article-meta/pub-date', 'pmc-release', format)
    if dt['status'] == 'ok':
        return dt
    return {'date': None, 'status': "ok"}
def build_medlinePgn(fp, lp):
    """Build a Medline-style page range from first page *fp* and last page *lp*.

    A missing side is rendered as '?'; returns '' when both are missing or
    empty. (Rewrote the '!= None' comparisons with 'is not None'.)
    """
    has_fp = fp is not None and len(fp) > 0
    has_lp = lp is not None and len(lp) > 0
    if has_fp and has_lp:
        return fp + '-' + lp
    if has_fp:
        return fp + '-?'
    if has_lp:
        return '?-' + lp
    return ''
def get_affiliations(someroot):
    # Parse every <aff> element into an {'id', 'label', 'name'} dict.
    result = []
    for aff in someroot.xpath('/article/front/article-meta//aff'):
        # grab the label text first, then blank it so it does not leak
        # into the fallback full-text name below
        # (do NOT remove the node: that would drop its tail text too)
        label_node = aff.find('label')
        label = get_clean_text(label_node)
        if label_node is not None:
            label_node.text = ''
        # preferred name: '<institution>, <country>' when both exist;
        # otherwise fall back to all text found under <aff>
        institution = get_clean_text(aff.find('institution'))
        country = get_clean_text(aff.find('country'))
        if len(institution) > 0 and len(country) > 0:
            name = institution + ', ' + country
        else:
            name = get_clean_text(aff)
        result.append({'id': aff.get('id'),
                       'label': label,
                       'name': clean_string(name)})
    return result
def get_authors(someroot):
    # Build author dicts from <contrib contrib-type="author"> elements.
    # Affiliations come either as <xref ref-type="aff"> ids or inline <aff>.
    contribs = someroot.xpath(
        '/article/front/article-meta/contrib-group/contrib[@contrib-type="author"]')
    result = []
    for contrib in contribs:
        surname = ''
        givennames = ''
        affiliations = []
        for node in contrib.iter():
            if node.tag == 'surname' and node.text is not None:
                surname = clean_string(node.text)
            elif node.tag == 'given-names' and node.text is not None:
                givennames = clean_string(node.text)
            elif node.tag == 'xref' and node.get('ref-type') == 'aff':
                # affiliation given by reference id
                if node.get('rid') is not None:
                    affiliations.append(node.get('rid'))
            elif node.tag == 'aff' and node.text is not None:
                # affiliation given inline (alternative form)
                affiliations.append(clean_string(node.text))
        result.append({
            'affiliations': affiliations,
            'last_name': surname,
            'first_name': givennames,
            'name': (givennames + ' ' + surname).strip(),
            'initials': get_initials(givennames),
        })
    if len(result) == 0:
        file_status_add_error("WARNING: no authors")
    return result
def get_initials(multiple_names):
    # First character of each space-separated name part; empty parts from
    # consecutive spaces are skipped.
    if multiple_names == '':
        return ''
    parts = multiple_names.split(' ')
    return ''.join(part[0] for part in parts if len(part.strip()) > 0)
def clean_string(s1):
    # Normalize whitespace: newline, NBSP and tab become spaces, then runs
    # of whitespace collapse to a single space and the ends are stripped.
    # None passes through unchanged.
    if s1 is None:
        return None
    translated = s1.replace('\n', ' ').replace(u'\u00a0', ' ').replace('\t', ' ')
    return ' '.join(translated.strip().split())
def get_abstract(someroot):
    # Concatenate and clean the text of every <abstract> element.
    parts = []
    for node in someroot.xpath('/article/front/article-meta/abstract'):
        parts.append(' '.join(node.itertext()))
    return clean_string(' '.join(parts))
# helper function, for stats printing
def indent(level):
    """Return (level - 1) spaces, '' for level <= 1 (stats pretty-printing).

    Replaces the manual accumulation loop with string repetition; max()
    guards against negative levels.
    """
    return ' ' * max(level - 1, 0)
def coalesce(*arg):
    # First argument that is not None, else None (like SQL COALESCE).
    return next((candidate for candidate in arg if candidate is not None), None)
# we remove any boxed-text from the XML tree
# they are not in the body text flow (illustrative purpose)
# this is a temp simple solution
# rare case: fewer than 1 in 10,000 publications
def handle_boxed_text_elements(someroot):
    # Drop every <boxed-text> subtree (illustrative material outside the
    # body text flow) and log a warning when at least one was removed.
    bt_list = someroot.xpath('//boxed-text')
    if not bt_list:
        return
    for bt in bt_list:
        bt.getparent().remove(bt)
    file_status_add_error('WARNING: removed some <boxed-text> element(s)')
def remove_alternative_title_if_redundant(someroot):
    # When both <article-title> and <alt-title> exist, keep only the former.
    title = someroot.find('./front/article-meta/title-group/article-title')
    alt_title = someroot.find('./front/article-meta/title-group/alt-title')
    if title is None or alt_title is None:
        return
    alt_title.getparent().remove(alt_title)
# we remove all elements and their subtree having tag in tag_list
def remove_subtree_of_elements(someroot, tag_list):
    """Remove every element whose tag is in *tag_list*, with its subtree.

    The iterator is materialized into a list first: removing nodes while
    iter() is still walking the tree can skip elements.
    """
    for el in list(someroot.iter(tag_list)):
        el.getparent().remove(el)
def handle_table_wrap(pmcid, tw):
    # Convert a <table-wrap> element into a flat dict: label, caption,
    # footer, media/graphic hrefs, parsed table content and raw table XML.
    xref_id = tw.get('id') or ''
    xref_url = ('https://www.ncbi.nlm.nih.gov/pmc/articles/'
                + pmcid + '/table/' + xref_id)
    columns = []
    row_values = []
    table_xml = b''
    # the <table> may sit directly under <table-wrap> or under <alternatives>
    table_tree = tw.find('table')
    if table_tree is None:
        table_tree = tw.find('alternatives/table')
    if table_tree is not None:
        table_xml = etree.tostring(table_tree)
        columns, row_values = table_to_df(table_xml)
    return {
        'tag': 'table',
        'xref_id': xref_id,
        'xref_url': xref_url,
        'label': get_clean_text(tw.find('label')),
        'caption': get_clean_text(tw.find('caption')),
        'footer': get_clean_text(tw.xpath('table-wrap-foot')),
        'media': [get_xlink_href(el) for el in tw.xpath('media')],
        'graphics': [get_xlink_href(el) for el in tw.xpath('graphic')],
        'table_columns': columns,
        'table_values': row_values,
        'xml': table_xml.decode("utf-8"),
    }
def table_to_df(table_text):
    # Parse serialized <table> XML into (column_headers, rows_of_cell_text).
    # Returns (None, None) when <tbody> holds no <tr> at all.
    # Rows of unequal length are kept on purpose (pam 15.12.2023): we care
    # about content, not exact positions, and filtering them lost data.
    table_tree = etree.fromstring(table_text)
    columns = [' '.join(cell.itertext()).strip()
               for header_row in table_tree.xpath('thead/tr')
               for cell in header_row.getchildren()]
    row_values = []
    for body_row in table_tree.findall('tbody/tr'):
        row_values.append([' '.join(cell.itertext()).strip()
                           for cell in body_row.xpath('td')])
    if len(row_values) >= 1:
        return columns, row_values
    return None, None
def get_clean_text(el):
    # All text under *el*, whitespace-normalized. Accepts a single element,
    # a list of elements (texts joined with spaces), or None (-> '').
    if el is None:
        return ''
    if isinstance(el, list):
        joined = ' '.join(' '.join(sub_el.itertext()) for sub_el in el)
        return clean_string(joined)
    return clean_string(' '.join(el.itertext()))
def modify_insert_text_in_sub_element(ins_texts, subel_tag, el):
    """Prepend *ins_texts* to the text of el's <subel_tag> child, in place.

    Reads the first matching child (caption/label cardinality is 0-1),
    removes ALL <subel_tag> children, then appends a single new child
    whose text is the cleaned concatenation of ins_texts plus the old
    text. Used to push group-level captions/labels into group members.
    """
    texts = list(ins_texts)
    subel = el.find(subel_tag)  # only the first match matters (0-1 cardinality)
    if subel is not None:
        texts.append(' '.join(subel.itertext()))
    new_text = clean_string(' '.join(texts))
    # rebuild the subelement with its new text content; materialize the
    # iterator first -- removing while iterchildren() runs can skip nodes
    for old in list(el.iterchildren(subel_tag)):
        el.remove(old)
    etree.SubElement(el, subel_tag).text = new_text
# easy way to get value of
# attribute 'href' or '{http://www.w3.org/1999/xlink}href'
def get_xlink_href(el):
    """Value of the 'href' attribute, namespaced or not.

    lxml exposes xlink attributes under the expanded name
    '{http://www.w3.org/1999/xlink}href'; matching on the suffix covers
    both the plain and the namespaced form. Returns None for None input
    or when no href-like attribute exists.
    """
    if el is None:
        return None
    for key in el.keys():
        if key.endswith('href'):
            return el.get(key)
    return None
# modifies the original XML by:
# 1. moving <table-wrap> elements next to their embedding <supplementary_material> element
# 2. removing <supplementary_material> elements from XML
# Note: we ignore implicit embedded figure (there may be a figure label, caption, etc...)
def handle_supplementary_material_elements(someroot):
    """Unwrap <supplementary-material> elements in place.

    strip_tags removes the tags but keeps their children and text in the
    document flow, so embedded <table-wrap>/<fig>/<p> content survives.
    Replaces the more elaborate *_ori variant kept below for reference.
    """
    etree.strip_tags(someroot,'supplementary-material')
def handle_supplementary_material_elements_ori(someroot):
    """Legacy handler, superseded by the strip_tags version above.

    Moves <table-wrap>, <p> and <fig> children out of each
    <supplementary-material> element (as preceding siblings), then
    removes the now-unnecessary wrapper. Other content (label, caption,
    media, graphic) is dropped with the wrapper.
    """
    sm_list = someroot.xpath('//supplementary-material')
    if sm_list is None: return
    for sm in sm_list:
        # materialize first: addprevious() re-parents the node, and
        # mutating the child list while iterchildren() runs can skip nodes
        for el in list(sm.iterchildren('table-wrap', 'p', 'fig')):
            sm.addprevious(el)
        # removes supplementary_material which is now unnecessary
        sm.getparent().remove(sm)
# modifies the original XML by:
# 1. adding <table-wrap-group> caption and label text to each child <table-wrap> element caption
# 2. moving <table-wrap> elements next to their embedding <table-wrap-group>
# 3. removing <table-wrap-group> elements from XML
def handle_table_wrap_group_elements(someroot):
    # Flatten <table-wrap-group> wrappers: push the group's caption/label
    # text into each member <table-wrap>, hoist the members out as
    # preceding siblings, then delete the now-empty group element.
    group_list = someroot.xpath('//table-wrap-group')
    if group_list is None:
        return
    for group in group_list:
        group_captions = [' '.join(c.itertext()) for c in group.iterchildren('caption')]
        group_labels = [' '.join(l.itertext()) for l in group.iterchildren('label')]
        for tw in group.xpath('table-wrap'):
            modify_insert_text_in_sub_element(group_labels, 'label', tw)
            modify_insert_text_in_sub_element(group_captions, 'caption', tw)
            # moves tw out, as the previous sibling of the group
            group.addprevious(tw)
        group.getparent().remove(group)
# modifies the original XML by
# 1. moving <el_tag> elements next to their embedding <el_tag>-group
# 2. removing <el_tag>-group from elements from XML
def remove_embedding_group_elements(someroot, el_tag):
    # Unwrap <el_tag>-group elements: hoist each <el_tag> child out as a
    # preceding sibling, then drop the group wrapper itself.
    groups = someroot.xpath('//' + el_tag + '-group')
    if groups is None:
        return
    for group in groups:
        for member in group.xpath(el_tag):
            group.addprevious(member)
        group.getparent().remove(group)
# modifies the original XML by
# 1. adding <fig-group> caption text to each child <fig> element caption
# 2. moving <fig> elements next to their embedding <fig-group>
# 3. removing <fig-group> from elements from XML
def handle_fig_group_elements(someroot):
    # Flatten <fig-group> wrappers: push the group's caption text into
    # each member <fig>, hoist the figs out as preceding siblings, then
    # delete the now-empty group element.
    group_list = someroot.xpath('//fig-group')
    if group_list is None:
        return
    for group in group_list:
        captions = [' '.join(c.itertext()) for c in group.iterchildren('caption')]
        for fig in group.xpath('fig'):
            modify_insert_text_in_sub_element(captions, 'caption', fig)
            # moves fig out, as the previous sibling of the group
            group.addprevious(fig)
        group.getparent().remove(group)
def handle_fig(pmcid, fig):
    # Convert a <fig> element into a flat dict; figures without an id get
    # an empty xref_url (no stable PMC anchor to point at).
    xref_id = fig.get('id') or ''
    if xref_id == '':
        xref_url = ''
    else:
        xref_url = ('https://www.ncbi.nlm.nih.gov/pmc/articles/'
                    + pmcid + '/figure/' + xref_id)
    return {
        'tag': 'fig',
        'caption': get_clean_text(fig.find('caption')),
        'xref_url': xref_url,
        'xref_id': xref_id,
        'label': get_clean_text(fig.find('label')),
        'media': [get_xlink_href(el) for el in fig.xpath('media')],
        'graphics': [get_xlink_href(el) for el in fig.xpath('graphic')],
        'pmcid': pmcid,
    }
def handle_list(list_el):
    """Convert a <list> element into a list of content dicts.

    Each <list-item> becomes {'tag': 'list-item', 'text': ...}; any tail
    text after the list is appended as a pseudo-paragraph.
    (Parameter renamed from 'list', which shadowed the builtin; all call
    sites pass it positionally, so the rename is safe.)
    """
    contents = [{'tag': 'list-item',
                 'text': clean_string(' '.join(item.itertext()))}
                for item in list_el.iterchildren(['list-item'])]
    tail = clean_string(list_el.tail)
    if tail is not None:
        contents.append({'tag': 'p', 'text': tail})
    return contents
# a paragraph <p> may contain <fig> and / or <table-wrap> and / or <list> elements.
# if this is the case figs, tables and lists are parsed with their own handler
# and appended in order in the content list returned
def handle_paragraph(pmcid, el):
    # Parse a <p> that may embed <fig>, <table-wrap> and/or <list>
    # elements; each embedded block is parsed by its own handler and
    # appended, in document order, to the returned content list.
    simplify_node(el, ['fig', 'table-wrap', 'list'])
    contents = []
    leading = clean_string(el.text)
    if leading is not None and leading != '':
        contents.append({'tag': 'p', 'text': leading})
    # after simplify_node we only expect fig / table-wrap / list children,
    # but anything else still gets the default tag + text treatment
    for child in el.iterchildren():
        if child.tag == 'fig':
            contents.append(handle_fig(pmcid, child))
        elif child.tag == 'table-wrap':
            contents.append(handle_table_wrap(pmcid, child))
        elif child.tag == 'list':
            contents.extend(handle_list(child))
        else:
            contents.append({'tag': child.tag, 'text': get_clean_text(child)})
    trailing = clean_string(el.tail)
    if trailing is not None and trailing != '':
        contents.append({'tag': 'p', 'text': trailing})
    return contents
def handle_paragraph_old(pmcid, el):
    """Legacy paragraph handler (superseded by handle_paragraph).

    Extracts embedded <fig>/<table-wrap> elements (each parsed by its own
    handler), removes them from the paragraph, then prepends the remaining
    paragraph text as the first content entry.
    """
    contentList = []
    # materialize first: removing children while iterchildren() is still
    # running can skip siblings
    for sub_el in list(el.iterchildren(['fig', 'table-wrap'])):
        if sub_el.tag == 'fig':
            # parse the inner fig and add result to content list
            contentList.append(handle_fig(pmcid, sub_el))
        elif sub_el.tag == 'table-wrap':
            # parse the inner table and add result to content list
            contentList.append(handle_table_wrap(pmcid, sub_el))
        # remove fig / table from paragraph
        sub_el.getparent().remove(sub_el)
    # with tables and figures gone, build the paragraph content and set it
    # at first rank in the content list
    content = {'tag': el.tag, 'text': clean_string(' '.join(el.itertext()))}
    contentList.insert(0, content)
    return contentList
# recursive function used to parse the article body (or floats-group or back node too).
# the body tree is traversed depth first:
# on encountering a section <sec> or <app> or <boxed-text> element, the function calls itself
# on encountering <p>, <fig>, <list> and <table-wrap> elements, dedicated handlers are called
# on encountering another element, a default handler is used
def handle_section_flat(pmcid, sec, level, implicit, block_id):
    """Recursively flatten a <sec>/<app>/<boxed-text> tree into a flat list
    of section dicts.

    Depth-first traversal: subsections trigger recursion; <p>, <fig>,
    <table-wrap> and <list> get dedicated handlers; any other element
    falls back to a tag + full-text default. *block_id* is a mutable
    counter stack (also module-global) used to build dotted ids like
    '1.2.3'; a level is pushed before walking children and popped after.
    Once a subsection has been seen, later terminal content is wrapped in
    an implicit section so document order survives the flattening (see
    addContentsOrWrappedContents).
    """
    sectionList = []
    # NOTE(review): 'id' is computed but unused -- dotted block ids are
    # used instead; confirm the @id attribute can really be discarded
    id = ''.join(sec.xpath('@id'))
    title = get_clean_text(sec.find('title'))
    caption = get_clean_text(sec.find('caption'))
    label = get_clean_text(sec.find('label'))
    mainSection = {'implicit':implicit, 'level': level, 'id': build_id(block_id),
                   'title': title,
                   'label': label,
                   'caption': caption,
                   'tag': sec.tag,
                   'contents':[]}
    # we add main section to the list before any other sub sections
    sectionList.append(mainSection)
    block_id.append(0)
    terminalContentShouldBeWrapped=False
    for el in sec:
        # ignore elements handled elsewhere or that are unnecessary
        if isinstance(el, etree._Comment): continue
        if isinstance(el, etree._XSLTProcessingInstruction): continue
        if isinstance(el, etree._ProcessingInstruction): continue
        if el.tag == 'title': continue
        if el.tag == 'label': continue
        if el.tag == 'caption': continue
        # recursive call for any embedded section <sec>, <boxed-text> and/or <app> (appendices)
        if el.tag == 'sec' or el.tag == 'app' or el.tag == 'boxed-text':
            block_id[-1] = block_id[-1] + 1
            # from here on, terminal content must be wrapped to keep order
            terminalContentShouldBeWrapped=True
            sectionList.extend(handle_section_flat(pmcid, el, level + 1, False, block_id))
            continue
        contentsToBeAdded=[]
        # handle paragraphs: will return paragraph content plus any embedded figures or tables as sibling contents
        if el.tag == 'p':
            contentsToBeAdded = handle_paragraph(pmcid, el)
        elif el.tag == 'fig':
            contentsToBeAdded = [ handle_fig(pmcid, el) ]
        elif el.tag == 'table-wrap':
            contentsToBeAdded = [ handle_table_wrap(pmcid, el) ]
        elif el.tag == 'list':
            contentsToBeAdded = handle_list(el)
        # default handler: just keep tag and get all text
        else:
            sometext = clean_string(' '.join(el.itertext()))
            if sometext is not None and sometext != '':
                contentsToBeAdded = [ {'tag': el.tag, 'text': sometext} ]
        addContentsOrWrappedContents(sectionList, mainSection, contentsToBeAdded, level, terminalContentShouldBeWrapped)
    block_id.pop()
    return sectionList
# We want the order of contents to be preserved during parsing
# When we meet a section having a mix of content types including a sub section the order may be lost if we don't wrap some contents
# XML Example:
# <sec id="main_sec">
# <p id="p1">...</p>
# <sec id="sub_sec"><p id="p2">...</p></sec>
# <p id="p3">...</p>
# </sec>
# If we don't wrap p2 in a fake section, then the content order would become
# main_sec: [p1, p3]
# sub_sec : [p2]
# By using the method below we will generate
# main_sec: [p1]
# sub_sec : [p2]
# wrap_sec: [p3]
def addContentsOrWrappedContents(sectionList, currentSection, contentsToBeAdded, level, shouldBeWrapped):
    """Append contents to *currentSection*, or wrap them in a fake section.

    When *shouldBeWrapped* is true a synthetic section (tag 'wrap',
    implicit=True) is appended to *sectionList* and receives the contents,
    preserving document order after flattening -- see the worked example
    in the comment block above. Content ids are dotted paths built from
    the module-level block_id counter stack.
    NOTE(review): mutates the global block_id even though
    handle_section_flat passes its own block_id parameter around --
    confirm both always refer to the same list.
    NOTE(review): wrapper sections carry no 'caption' key, unlike sections
    built in handle_section_flat -- confirm downstream readers tolerate it.
    """
    if contentsToBeAdded==[]: return
    targetContents = currentSection['contents']
    if shouldBeWrapped:
        block_id[-1] = block_id[-1] + 1
        wid = build_id(block_id)
        subSection = {'implicit':True, 'level':level+1, 'id':wid, 'title':'', 'label':'' ,'tag':'wrap', 'contents':[]}
        sectionList.append(subSection)
        targetContents=subSection['contents']
        block_id.append(0)
    for content in contentsToBeAdded:
        block_id[-1] = block_id[-1] + 1
        content['id'] = build_id(block_id)
        targetContents.append(content)
    if shouldBeWrapped:
        block_id.pop()
def build_id(a):
    """Join the counter list *a* into a dotted id, e.g. [1, 2, 3] -> '1.2.3'.

    Bug fix: the original ignored its argument and read the module-global
    block_id instead. Every call site passes block_id, so using the
    parameter is behavior-identical and makes the function self-contained.
    Returns '' for an empty list, as before.
    """
    return '.'.join(str(num) for num in a)
# ------------------------------------------
def file_status_reset():
    # Reset the global per-file status record (name and error list).
    file_status['name'] = ''
    del file_status['errors'][:]
def file_status_set_name(n):
    # Record the name of the file currently being processed.
    file_status['name'] = n
def file_status_add_error(r):
    # Append an error/warning message to the current file's status record.
    file_status['errors'].append(r)
def file_status_ok():
    # True when no errors/warnings were recorded for the current file.
    return not file_status['errors']
def file_status_print():
    # Print one tab-separated line: file name, error count, then each
    # recorded message (trailing tab kept, as in the original output).
    fields = [file_status['name'], str(len(file_status['errors']))]
    fields.extend(file_status['errors'])
    print('\t'.join(fields) + '\t')
# - - - - - - - - - - - - - - - - - - - - - - - -
# used by jsonpmc_httpserver.py
# - - - - - - - - - - - - - - - - - - - - - - - -
def parse_PMC_XML(xmlstr):
    """Parse a PMC XML string with default root / file name (HTTP server entry point)."""
    return parse_PMC_XML_core(xmlstr,None, None)
# - - - - - - - - - - - - - - - - - - - - - - - -
# used by jsonpmc_httpserver.py
# - - - - - - - - - - - - - - - - - - - - - - - -
def getPmcFtpUrl(xmlstr):
    # Pick the FTP archive link from an OA service response. With a single
    # link, return it; with several, prefer the tgz archive; else None.
    root = etree.fromstring(xmlstr)
    links = root.xpath('/OA/records/record/link')
    if links is not None:
        if len(links) == 1:
            return links[0].get('href')
        if len(links) > 1:
            print("WARNING: multiple archives available on ftp, choosing format tgz")
            for link in links:
                if link.get('format') == 'tgz':
                    return link.get('href')
    return None
# - - - - - - - - - - - - - - - - - - - - - - - -
def parse_PMC_XML_core(xmlstr, root, input_file):
xmlstr = cleanup_input_xml(xmlstr)
if root is None:
root = etree.fromstring(xmlstr)
if input_file is None:
input_file = '(unknown file name)'
# (re)init stats variable
file_status_reset()
file_status_set_name(input_file)
# (re)init global variable block_id used for building section / block ids
block_id.clear()
# (re)init output variable
dict_doc = {}
# Preprocessing tasks: simplify / clean up of the original xml
# To be kept here before any parsing aimed at retrieving data
# drop citation cross-references nested with <sup> in either order
for xs in root.xpath('//xref/sup'): xs.getparent().remove(xs)
for sx in root.xpath('//sup/xref'): sx.getparent().remove(sx)
# flatten inline formatting: the tags vanish, their text merges into the parent
etree.strip_tags(root,'sup')
etree.strip_tags(root,'italic')
etree.strip_tags(root,'bold')
etree.strip_tags(root,'sub')
etree.strip_tags(root,'ext-link')
# rename this erroneous element
for el in root.xpath('/article/floats-wrap'): el.tag='floats-group'
# drop formula subtrees entirely; with_tail=False keeps the trailing text
etree.strip_elements(root, 'inline-formula','disp-formula', with_tail=False)
#remove_subtree_of_elements(root,['inline-formula','disp-formula'])
handle_supplementary_material_elements(root)
handle_table_wrap_group_elements(root)
handle_fig_group_elements(root)
remove_embedding_group_elements(root,'fn') # removes fn-group wrapper (foot-notes)
remove_embedding_group_elements(root,'app') # removes app-group wrapper (appendices)
remove_alternative_title_if_redundant(root)
# End preprocessing
# Now retrieve data from refactored XML
dict_doc['affiliations'] = get_affiliations(root)
dict_doc['authors'] = get_authors(root)
# note: we use xref to retrieve author affiliations above this line
etree.strip_tags(root,'xref')
dict_doc['article_type'] = root.xpath('/article')[0].get('article-type')
# namespace xml = {http://www.w3.org/XML/1998/namespace}
lng = root.xpath('/article')[0].get('{http://www.w3.org/XML/1998/namespace}lang')
if lng == None : lng = ''
# keep only the primary subtag, e.g. 'en' from 'en-US'
dict_doc['language'] = lng[0:2]
# note: we can get multiple journal-id elements with different journal-id-type attributes
dict_doc['medline_ta'] = get_text_from_xpath(root, '/article/front/journal-meta/journal-id', False, True)
dict_doc['journal'] = get_multiple_texts_from_xpath(root, '/article/front/journal-meta//journal-title', True)
# note: I did not see any multiple <article-title> elements but we retrieve each element of the hypothetical list just in case
#dict_doc['title'] = get_multiple_texts_from_xpath(root, '/article/front/article-meta/title-group/article-title', True)
dict_doc['title'] = get_multiple_texts_from_xpath(root, '/article/front/article-meta/title-group', True)
dict_doc['pmid'] = get_text_from_xpath(root, '/article/front/article-meta/article-id[@pub-id-type="pmid"]', True, False)
dict_doc['doi'] = get_text_from_xpath(root, '/article/front/article-meta/article-id[@pub-id-type="doi"]', True, False)
# the archive and manuscript types are used for preprints
dict_doc['archive_id'] = get_text_from_xpath(root, '/article/front/article-meta/article-id[@pub-id-type="archive"]', True, False)
dict_doc['manuscript_id'] = get_text_from_xpath(root, '/article/front/article-meta/article-id[@pub-id-type="manuscript"]', True, False)
# we might find at least one of these:
pmc1 = get_text_from_xpath(root, '/article/front/article-meta/article-id[@pub-id-type="pmc-uid"]', True, False)
pmc2 = get_text_from_xpath(root, '/article/front/article-meta/article-id[@pub-id-type="pmc"]', True, False)
# precedence: pmc-uid first, then pmc
pmc = pmc1
if pmc == '': pmc = pmc2
if pmc == '' and dict_doc['archive_id'] == '': file_status_add_error("ERROR, no value for article id in types pmc-uid, pmc, or archive")
dict_doc['pmcid'] = pmc
dict_doc['_id'] = pmc
# if we have no pmc id then use the archive id (for preprints)
if pmc == '' and dict_doc['archive_id'] != '': dict_doc['_id'] = dict_doc['archive_id']
# ok with Julien, see precedence rules in def get_pub_date()
dict_doc['publication_date'] = get_pub_date(root, 'd-M-yyyy')['date']
dict_doc['pmc_release_date'] = get_pmc_release_date(root, 'd-M-yyyy')['date']
#dict_doc['publication_date_alt'] = get_pub_date(root, 'default format')['date'] # 'yyyy MMM d'
dict_doc['pubyear'] = get_pub_date(root, 'yyyy')['date']
#dict_doc['publication_date_status']=get_pub_date(root, 'yyyy')['status']
dict_doc['issue'] = get_text_from_xpath(root, '/article/front/article-meta/issue', True, False)
dict_doc['volume'] = get_text_from_xpath(root, '/article/front/article-meta/volume', True, False)
fp = get_text_from_xpath(root, '/article/front/article-meta/fpage', False, False)
lp = get_text_from_xpath(root, '/article/front/article-meta/lpage', False, False)
dict_doc['start_page'] = fp
dict_doc['end_page'] = lp
dict_doc['medline_pgn'] = build_medlinePgn(fp,lp)
#dict_doc['abstract'] = get_abstract(root) -- should be obsolete now
dict_doc['abstract'] = get_clean_text(root.find('front/article-meta/abstract'))
dict_doc['keywords'] = get_keywords(root)
# filling body, back and floats sections
dict_doc['body_sections'] = []
block_id.append(1)
# when a title exists it becomes an implicit first section with fixed id '1'
if dict_doc['title'] != '':
dict_doc['body_sections'].append({
'implicit':True, 'level':1, 'id':'1', 'label':'', 'title':'Title',
'contents': [{'tag':'p', 'id':'1.1', 'text': dict_doc['title']}]})
block_id[-1] = block_id[-1] + 1
# the abstract is parsed as a regular section after injecting an explicit title
if dict_doc['abstract'] != '':
abs_node = root.find('./front/article-meta/abstract')
abs_title = etree.SubElement(abs_node, "title")
abs_title.text = 'Abstract'
sectionList = handle_section_flat(dict_doc['_id'], abs_node, 1, False, block_id)
dict_doc['body_sections'].extend(sectionList)
block_id[-1] = block_id[-1] + 1
dict_doc['body_sections'].extend(get_sections(dict_doc['pmcid'], root.find('body')))
dict_doc['float_sections']=get_sections(dict_doc['pmcid'], root.find('floats-group'))
dict_doc['back_sections']=get_sections(dict_doc['pmcid'], root.find('back'))
# for stats and debugging, can be commented
dict_doc['figures_in_body']=len(root.xpath('/article/body//fig'))
dict_doc['figures_in_back']=len(root.xpath('/article/back//fig'))
dict_doc['figures_in_float']=len(root.xpath('/article/floats-group//fig'))
dict_doc['tables_in_body']=len(root.xpath('/article/body//table'))
dict_doc['tables_in_back']=len(root.xpath('/article/back//table'))
dict_doc['tables_in_float']=len(root.xpath('/article/floats-group//table'))
dict_doc['paragraphs_in_body']=len(root.xpath('/article/body//p'))
dict_doc['paragraphs_in_back']=len(root.xpath('/article/back//p'))
dict_doc['paragraphs_in_float']=len(root.xpath('/article/floats-group//p'))
# for compatibility reasons
# NOTE: from here on pmcid carries the 'PMC' prefix; a bare 'PMC' means no id
dict_doc['pmcid']='PMC' + dict_doc['pmcid']
dict_doc['_id'] = dict_doc['pmcid']
# in case of a preprint we only have an archive id, we store it as the _id
if dict_doc['pmcid'] == 'PMC' and dict_doc['archive_id'] != '': dict_doc['_id'] = dict_doc['archive_id']
# if we have no pmcid, store an empty string for it
if dict_doc['pmcid'] == 'PMC': dict_doc['pmcid'] = ''
return dict_doc
def get_sections(pmcid, node):
    """Parse *node* (body / back / floats-group) into a flat section list.

    Returns [] when the element is absent. Uses the module-level ``block_id``
    counter and bumps its last component after parsing, like the other
    section-producing call sites.
    """
    if node is None:
        return []
    section_list = handle_section_flat(pmcid, node, 1, True, block_id)
    block_id[-1] += 1
    return section_list
# Recursively visits sub-elements of node.
# Sub-elements having a tag in kept_tags (i.e. fig, table, list) are left unchanged as well as their own sub-elements
# Other sub-elements are removed but their text / tail are attached to the appropriate sibling or embedding element.
# Old method is buggy and obsolete, do NOT use it
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def simplify_node(el, kept_tags, starting=True):
    """Simplify *el* in place, delegating to the old or new implementation.

    The global ``use_old`` flag (set from the -o command-line option) selects
    the implementation; ``starting`` is accepted for signature compatibility
    but not used by the dispatcher itself.
    """
    print("Choosing old / new", use_old)
    simplifier = simplify_node_old if use_old else simplify_node_new
    simplifier(el, kept_tags)
# Recursively visits sub-elements of node.
# Buggy method, replaced in June 2023 by simplify_node_new (defined below)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def simplify_node_old(el, kept_tags, starting=True):
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Legacy simplifier kept only so -o/--old runs can be compared against the
# new implementation; known buggy (see comments above) -- do not fix, do not use.
print("Using old")
if starting:
# top level: never stringify el itself, only recurse into its children
for subel in el.iterchildren():
simplify_node_old(subel, kept_tags, False)
elif el.tag not in kept_tags: # we stringify this el
# target for the rescued text: previous sibling's tail, else parent's text
trg_node = el.getprevious() if el.getprevious() is not None else el.getparent()
trg_attr = 'tail' if el.getprevious() is not None else 'text'
if el.text is not None: setattr(trg_node, trg_attr, (getattr(trg_node, trg_attr) or '') + el.text)
# children are hoisted next to el before being simplified themselves
for subel in el.iterchildren():
el.addnext(subel)
simplify_node_old(subel, kept_tags, False)
if el.tail is not None: setattr(trg_node, trg_attr, (getattr(trg_node ,trg_attr) or '') + el.tail)
el.getparent().remove(el)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def simplify_node_new(node, kept_tags):
    """Flatten *node* in place: keep ``kept_tags`` subtrees, stringify the rest.

    The content is first collected (depth-first) as an ordered mix of text
    fragments and kept elements, then the node is rebuilt from that list.
    The node's own tail is preserved across the rebuild.
    """
    print("Using new")
    collected = []
    recursive_simplify_node(node, kept_tags, collected)
    saved_tail = node.tail
    # clear() drops children/text/tail; restore the tail we saved
    node.clear()
    node.tail = saved_tail
    target = node
    for item in collected:
        if not isinstance(item, str):
            # a kept element: re-attach it and make it the text target
            target = item
            node.append(target)
        elif target is node:
            # text before the first kept element goes into node.text
            target.text = item if target.text is None else target.text + item
        else:
            # later text hangs off the last re-attached element's tail
            target.tail = item if target.tail is None else target.tail + item
# Recursively visits sub-elements of node.
# Sub-elements having a tag in kept_tags (i.e. fig, table, list) are left unchanged as well as their own sub-elements
# Other sub-elements are removed but their text / tail are attached to the appropriate sibling or embedding element.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def recursive_simplify_node(node, kept_tags, elems, top_level=True):
    """Collect *node*'s content into *elems* as an ordered mix of strings and elements.

    Sub-elements whose tag is in *kept_tags* (e.g. fig, table, list) are appended
    as-is, subtree included. Any other element contributes its text, its
    (recursively collected) children, and its tail as plain strings. The
    top-level node's own tail is excluded: it belongs to the caller's context,
    not to the simplified content.
    Fix: removed a dead ``top_level = False`` assignment inside the tail branch
    (it was never read afterwards and had no effect).
    """
    if node.tag in kept_tags:
        elems.append(node)
    else:
        if node.text is not None:
            elems.append(node.text)
        for child in node.iterchildren():
            recursive_simplify_node(child, kept_tags, elems, False)
        if node.tail is not None and not top_level:
            elems.append(node.tail)
# - - - - - - - - - - - - - - - - -
def main():
    """Parse one PMC XML file given on the command line and dump it as JSON.

    Reads the input path from the first positional argument, parses it with
    parse_PMC_XML_core, and writes the resulting dict (pretty-printed JSON)
    under out/<first-2-chars-of-pmcid>/pmc<pmcid>[-old].json.
    Fixes: ``usage`` was built but never passed to OptionParser; the output
    file is now written through a context manager (no leaked handle on error);
    makedirs uses exist_ok instead of a racy exists() check.
    """
    global use_old
    parser = OptionParser(usage="%prog file")
    # NOTE(review): -f/--file is parsed but the positional arg is what is used
    parser.add_option("-f", "--file", dest="filename", help="Process one file for now")
    # use_old: option used for debugging the old version of simplify_node
    parser.add_option("-o", "--old", dest="use_old", default=False, action="store_true",
                      help="Use old or new node simplifier")
    (options, args) = parser.parse_args()
    if len(args) < 1:
        sys.exit("Please provide a file")
    input_file = args[0]
    use_old = options.use_old
    file_status_reset()
    file_status_set_name(input_file)
    print('------ ' + str(datetime.now()) + ' ' + input_file)
    xmlstr = get_file_content(input_file)
    xmlstr = cleanup_input_xml(xmlstr)
    # show head / tail of the cleaned XML for eyeball debugging
    print("========")
    print(xmlstr[0:600])
    print("...")
    print(xmlstr[-600:])
    print("========")
    root = etree.fromstring(xmlstr)
    lines = get_fig_parents(input_file, root)
    lines.extend(get_tw_parents(input_file, root))
    for l in lines:
        print(l)
    dict_doc = parse_PMC_XML_core(xmlstr, root, input_file)
    if len(dict_doc['body_sections']) < 2:
        file_status_add_error("ERROR: no section after title")
    if not file_status_ok():
        file_status_print()
    print(get_stats(input_file, root))
    print(get_body_structure(input_file, root))
    output_file = 'outfile'
    subdir = 'out'
    if 'pmcid' in dict_doc:
        # bucket output by the first two characters of the pmcid
        subdir = subdir + '/' + dict_doc['pmcid'][0:2]
        output_file = 'pmc' + dict_doc['pmcid']
    os.makedirs(subdir, exist_ok=True)
    if use_old:
        output_file += "-old"
    output_file += '.json'
    out_path = subdir + '/' + output_file
    with codecs.open(out_path, 'w', 'utf-8') as out_file:
        out_file.write(json.dumps(dict_doc, sort_keys=True, indent=2))
    print("output file is " + out_path)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Tests (please ignore)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def test1():