From 75ff7d4df33f61ab64f50396419aadf7ab7c8578 Mon Sep 17 00:00:00 2001
From: kirk0830 <yike.huang@aliyun.com>
Date: Thu, 5 Sep 2024 15:01:16 +0800
Subject: [PATCH] Enhance: enhance the exception handling on pp and improve the
 way to parse pp

---
 SIAB/io/pseudopotential/kernel.py         | 168 ++++++++++++++++++---
 SIAB/io/pseudopotential/tools/advanced.py | 170 +++++++++++++---------
 2 files changed, 253 insertions(+), 85 deletions(-)
diff --git a/SIAB/io/pseudopotential/kernel.py b/SIAB/io/pseudopotential/kernel.py
index 6ecbe417..b228720b 100644
--- a/SIAB/io/pseudopotential/kernel.py
+++ b/SIAB/io/pseudopotential/kernel.py
@@ -4,26 +4,52 @@ def iter_tree(root: ET.Element):
     return {child.tag: {"attrib": child.attrib, "data": child.text} for child in list(root.iter())}
 
 def preprocess(fname: str):
-    """ADC pseudopotential has & symbol at the beginning of line, which is not allowed in xml, replace & with &amp;"""
+    """ Pseudopotential XML file preprocess
+
+    preprocess is relatively hard-coded. There are some cases the UPF file is not standard xml, therefore
+    some modifications are needed. Cases considered:
+    1. ADC pseudopotential has & symbol at the beginning of line, which is not allowed in xml, replace `&` 
+    with `&amp;`
+    2. GBRV pseudopotential does not startswith <UPF version="2.0.1">, and not endswith </UPF>, add <UPF version="2.0.1">
+    to the beginning of the file and </UPF> to the end of the file
+    3. some of ADC pseudopotentials have `CDATA` tag, which is not allowed in xml, remove them
+    """
+    import re, uuid, os
+    ftemp = f"{str(uuid.uuid3(uuid.NAMESPACE_DNS, fname))}.xml"
+    #print(f"Preprocessing {fname}, will write standard XML formatted temporaray file to {ftemp}")
+    # because a pseudopotential file would not be large, directly read all lines into memory
     with open(fname, "r") as f:
         lines = f.readlines()
-    """GBRV pseudopotential does not startswith <UPF version="2.0.1">, but <PP_INFO>, 
-    add <UPF version="2.0.1"> to the beginning of the file and </UPF> to the end of the file"""
-    if not lines[0].startswith("<UPF version="):
-        lines.insert(0, "<UPF version=\"2.0.1\">\n")
-        lines.append("</UPF>")
-
-    with open(fname, "w") as f:
-        for line in lines:
-            """if line starts with &, replace & with &amp;, 
-            but if already &amp;, do not replace"""
-            if line.strip().startswith("&") and not line.strip().startswith("&amp;"):
-                line = line.replace("&", "&amp;")
-            
+    # it is compulsory for all XML files to start with <UPF version="2.0.1">
+    if not lines[0].startswith("<UPF version="): # not expected case, but how?
+        if lines[0].strip().startswith("<UPF version="):
+            # remove all left whitespaces
+            lines[0] = lines[0].strip() + "\n"
+        else:
+            lines.insert(0, "<UPF version=\"unknown\" comment=\"added to complete xml format\">\n")
+    # from the last line, check if the first line startswith `<` is not </UPF>. If not, add </UPF> to the end of the file
+    i = -1
+    while not lines[i].strip() and i > -len(lines):
+        i -= 1
+    if not "</UPF>" in lines[i]:
+        lines.append("</UPF>\n")
+    # write the modified lines back to the file
+    # there are more than one tasks remain:
+    # for ADC pseudopotential, replace `&` with `&amp;`
+    # for PseudoDojo v1.0 pseudopotential, carefully check consistency between opening and closing tags
+    # for ADC pseudopotential or ld1 generated, replace eliminate both `<![CDATA[` and `]]>` if they exist
+    lines = [line.replace("<![CDATA[", "").replace("]]>", "") for line in lines]
+    with open(ftemp, "w") as f:
+        # replace the `&` symbol at the beginning of the line with `&amp;`, this is done by xml_syntax_filter
+        for _, line in xml_syntax_filter(lines):
+        #for line in lines:
+            _match = re.match(r"^([\s]*)(\&[\w]+)([^;]*)(\s*)$", line)
+            line = f"{_match.group(1)}{_match.group(2).replace('&', '&amp;')}{_match.group(3)}{_match.group(4)}\n"\
+                   if _match else line
             f.write(line)
-
-            if line.strip() == "</UPF>":
+            if "</UPF>" in line: # there are some pseudopotential files endswith ppgen file, but will crash the xml parser
                 break
+    return os.path.abspath(ftemp)
 
 import SIAB.io.pseudopotential.tools.basic as siptb
 def postprocess(parsed: dict):
@@ -50,20 +76,22 @@ def postprocess(parsed: dict):
 
 def upf(fname: str):
     """parse the pseudopotential file, return a dictionary"""
+    import os
     error_msg = """ERROR: UPF file with non-XML format. Please contact with either developer
 of pseudopotential you use or the developer of this package. For the latter choice, 
 you can submit issue in Github Repository at:
 https://github.com/kirk0830/abacus_orbital_generation
 , thanks for understanding, raise TypeError and Quit..."""
-    preprocess(fname)
+    ftemp = preprocess(fname)
     try:
-        tree = ET.parse(fname)
+        tree = ET.parse(ftemp)
     except ET.ParseError:
         print(error_msg, flush=True)
         raise TypeError("ERROR: Please read the error message above.") from None
     root = tree.getroot()
     parsed = iter_tree(root)
     postprocess(parsed)
+    os.remove(ftemp)
     return parsed
     
 def vwr(fname: str):
@@ -191,6 +219,110 @@ def vwr(fname: str):
     }
     return out
 
+def xml_syntax_filter(content: str|list[str]):
+    """it is found that some pseudopotential may have incorrect xml format, this function is to check the consistency of tags
+    and correct if possible. A two-member tuple is returned, the first element is a boolean indicating whether the nearest tag is closed,
+    the second element is the corrected line.
+
+    ```python
+    with open("file.xml", "r") as f:
+        content = f.readlines()
+    for is_closed, line in xml_syntax_filter(content):
+        print(line)
+    ```
+    """
+    import re
+    content = content.split("\n") if isinstance(content, str) else content
+    # for there are tags like "<PP_HEADER" in the file, we need to keep the buffer
+    # then we can combine the buffer with the current line to form a complete tag
+    # like "<PP_HEADER .../>", then it will be a single tag
+    # or <PP_HEADER to form a complete "opening" tag <PP_HEADER ...>
+    buf = ""
+
+    # regular expression     example
+    resg    = r"<[^>]+/>"    # <.../>
+    reop    = r"<[^/][^>]+>" # <...>
+    recls   = r"</[^>]+>"    # </...>
+    relicmp = r"<[^>]+"      # <...
+    rericmp = r"[^>]+>"      # ...>
+    recmt   = r"<!--.*-->"   # <!--...-->
+    
+    # the stack to store the names of the tags, when a closing tag is found, it should be the same as the nearest opened tag
+    # otherwise, it is a mismatch error and correction is needed
+
+    # there is an exception caused by the opening comment: 20240506
+    inbuilt_name_stack = []
+    for line in content: # loop over all lines
+        l = line.strip() # copy the value then do the strip operation
+        # XML comments can be add both in one line and across line. However,
+        # only in pslibrary 1.0.0 the across-line-comment is encountered...
+        # but it is not clever to add more complexity (I mean more regular
+        # expressions) to handle this case
+        if re.match(recmt, l):
+            yield True, line
+        # single tag case, with the form <.../>, directly yield it, remember
+        # to yield the original line, rather than the stripped one
+        elif re.match(resg, l):
+            yield True, line
+        # opening tag case, with the form <...>. If this form appears, it
+        # is possible that there are plenty of data belonging to this tag.
+        # To save the tag name into the stack, then yield the original line
+        elif re.match(reop, l):
+            name = re.match(r"<([^ >]+)", l).group(1)
+            inbuilt_name_stack.append(name)
+            yield False, line
+        # closing tag case, with the form </...>, the end of the possibilty
+        # above. If this form appears, then it is the end of the tag, so pop
+        # the tag name from the stack, then yield the original line. However,
+        # for PseudoDojo v1.0 pseudopotential, there would be mismatch between
+        # the opening tag and the closing tag, so need to correct it
+        elif re.match(recls, l):
+            name = re.match(r"</([^ >]+)", l).group(1)
+            yield (True, line) if inbuilt_name_stack[-1] == name \
+                else (True, line.replace(name, inbuilt_name_stack[-1]))
+            inbuilt_name_stack.pop()
+        # the case that a "left tag incomplete", means the tag is not closed but like
+        # <PP_HEADER ...
+        # comment="PP_HEADER is the most typical case that not-closed tag"
+        # additional="but other tag can also be like this, e.g. PSWFC"
+        # ... />
+        # in this case, will use buffer to store all content until the tag is closed
+        # then yield-back the complete tag.
+        # however, must be sure the buffer is empty before the next tag
+        # EXCEPTION: read <!-- in
+        elif re.match(relicmp, l) and l != "<!--":
+            assert buf == "", "buffer is not empty"
+            buf = l
+            continue
+        elif re.match(rericmp, l) and l != "-->":
+            assert buf != "", "buffer is empty"
+            buf = " ".join([buf, l])
+            if re.match(resg, buf):
+                yield True, buf + "\n"
+                buf = ""
+            elif re.match(reop, buf):
+                name = re.match(r"<([^ >]+)", buf).group(1)
+                inbuilt_name_stack.append(name)
+                yield False, buf + "\n"
+                buf = ""
+            else:
+                raise ValueError(f"incorrect tag: {buf}")
+        # for not-tag case, might be data, so in this case, do not touch
+        # it. but it can also be within tag, say incomplete tag.
+        else:
+            # if buffer is not empty and the current line does not have any tag
+            # related information, then it would be the situation within the tag
+            # or out of any tags. Fortunately the tag will always start with its
+            # name, or say the symbol `<` always appear to be in the same line
+            # as the tag name, therefore once it is really within the tag, the
+            # buffer will at least has the content of tag name, rather than empty
+            if buf != "":
+                buf = " ".join([buf, l])
+                continue
+            # if buffer is empty, then it is not within any tag, just yield the line
+            else:
+                yield True, line
+
 import unittest
 class TestPspotKernel(unittest.TestCase):
     def test_vwr(self):
diff --git a/SIAB/io/pseudopotential/tools/advanced.py b/SIAB/io/pseudopotential/tools/advanced.py
index ae1a1b05..355440b2 100644
--- a/SIAB/io/pseudopotential/tools/advanced.py
+++ b/SIAB/io/pseudopotential/tools/advanced.py
@@ -3,80 +3,116 @@ def determine_type(parsed: dict):
     """pseudopotentials can be generated by not only one codes, to extract information 
     from PP_INFO, PP_INPUTFILE, need to know the exact way how information is organized
     """
-    error_msg = """ERROR: pseudopotential with not recognized type, this is due to limited support
+    error_msg = """
+##################################################################################################
+! ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR ERROR
+##################################################################################################
+
+Pseudopotential with not recognized type, this is due to only limited support
 implemented on orbital auto-generation feature. You can submit issue on Github Repository at:
-https://github.com/kirk0830/abacus_orbital_generation, and for instantly use, you can follow the
+https://github.com/kirk0830/abacus_orbital_generation. For instantly use, you can follow the
 following guidelines. Please check manually the pseudopotential file you provided and specify:
 1. ALL `nbands` with specific number
 2. ALL `nbands_ref` with specific number
-3. ALL `orb_conf` and `orb_ref` with specific configuration like either `[0, 1, 2, 3]` or `0s1p2d3f`
-
-ADDITIONALLY! Please check your input, if you are using `optimizer` = `bfgs`, with 
-`spill_guess` as `atomic`, a monomer (single atom) calculation will be run to generate the 
-initial guess for orbital optimization. You should explicitly create one section in
-`reference_systems` for monomer calculation (setting `shape` to be `monomer`), and specify the
-`nbands` rationally (can be very high, but may also be dangerous because PW calculation cannot
-calculate isolated atom electronic structure well, artifact brought about by PW will pollute the
-energy level information).
-
+3. ALL `lmaxmax` with specific number
+4. ALL `orb_conf` and `orb_ref` with specific configuration like either `[0, 1, 2, 3]` or `0s1p2d3f`
+5. If you are using `optimizer` = `bfgs`, with `spill_guess` as `atomic`, a monomer (single atom) 
+calculation will be run to generate the initial guess for orbital optimization. You should explicitly 
+create one section in `reference_systems` for monomer calculation (setting `shape` to be `monomer`), 
+and specify the `nbands` rationally (can be very high, but may also be dangerous because PW 
+calculation cannot calculate isolated atom electronic structure well, artifact brought about by PW 
+will pollute the energy level information).
+
+Make sure in your input the following tags are correctly set:
+{
+    "element": [the element symbol],
+
+    "reference_systems": [
+        {
+            "shape": "monomer",
+            "nbands": NUMBER,
+            "nspin": NUMBER,
+            "lmaxmax": NUMBER
+        },
+        {
+            "shape": "dimer",
+            "nbands": NUMBER,
+            "nspin": NUMBER,
+            "lmaxmax": NUMBER,
+            "bond_lengths": (can be "auto")
+        },
+        ...
+    ],
+    "orbitals": [
+        {
+            "zeta_notation": [S#]s[P#]p[D#]d[F#]f,
+            "nbands": NUMBER,
+            "orb_ref": "none" or [S#']s[P#']p[D#']d[F#']f,
+            "shape": (the same),
+        },
+        ...
+    ]
+}
 Then rerun the orbital generation.
 TypeError raised, Quit..."""
 
-    """ONCVPSP
-    ONCVPSP is the format of pseudopotential most seen in norm-conserving pseudopotential,
-    such as SG15, PD (developed by pwmat team?) and DOJO"""
-    if "ONCVPSP" in parsed["PP_INFO"]["data"]:
-        return "ONCVPSP"
-    if "ONCVPSP" in parsed["PP_HEADER"]["attrib"]["generated"]:
-        return "ONCVPSP"
-
-    """ADC
-    ADC is the format of pseudopotential collected in pslibrary, including
-    pslnc, rrkjus and kjpaw, most collected in QE website the pptable"""
-    if "ADC" in parsed["PP_INFO"]["data"]:
-        return "ADC"
-    if "ADC" in parsed["PP_HEADER"]["attrib"]["generated"]:
-        return "ADC"
-    if "ADC" in parsed["PP_HEADER"]["attrib"]["author"]:
-        return "ADC"
-    if "Generated using \"atomic\" code by A. Dal Corso" in parsed["PP_INFO"]["data"]:
-        return "ADC"
-    if "Generated using \"atomic\" code by A. Dal Corso" in parsed["PP_HEADER"]["attrib"]["generated"]:
-        return "ADC"
-    if "Generated using \"atomic\" code by A. Dal Corso" in parsed["PP_HEADER"]["attrib"]["author"]:
-        return "ADC"
-    
-    """GTH
-    this is the kind developed by CP2K developers, Goedecker, Hartwigsen, Hutter and Teter
-    et al. However, this kind of pseudopotential has non-diagonal element in DIJ matrices,
-    which is not supported by ABACUS yet."""
-    if "Goedecker/Hartwigsen/Hutter/Teter" in parsed["PP_HEADER"]["attrib"]["author"]:
-        return "GTH"
-        raise NotImplementedError("GTH pseudopotential is not supported by ABACUS yet because of non-diagonal DIJ matrices")
-    
-    """GBRV
-    It is one of the most efficient pseudopotential presently, ABACUS pw supports this kind
-    of pseudopotential, ABACUS lcao not yet.
-    """
-    if "Generated using Vanderbilt code" in parsed["PP_INFO"]["data"]:
-        return "GBRV"
-
-    """ATOMPAW
-    atompaw looks like ADC but not quite the same in occupation information
-    Comparatively the uni_marburg is actually more similar to ADC"""
-    if "ATOMPAW" in parsed["PP_INFO"]["data"]:
-        return "ATOMPAW"
-    if "ATOMPAW" in parsed["PP_HEADER"]["attrib"]["generated"]:
-        return "ATOMPAW"
-    
-    """vwr
-    vwr is a old-fashioned pseudopotential format, there are codes can convert the vwr to upf format. Then this branch
-    will be called to parse the upf format pseudopotential"""
-    if "vwr" in parsed["PP_INFO"]["data"] or "VWR" in parsed["PP_INFO"]["data"]:
-        return "VWR"
-
-    print(error_msg, flush = True)
-    raise TypeError("ERROR: please see error message above.")
+    try:
+        """ONCVPSP
+        ONCVPSP is the format of pseudopotential most seen in norm-conserving pseudopotential,
+        such as SG15, PD (developed by pwmat team?) and DOJO"""
+        if "ONCVPSP" in parsed["PP_INFO"]["data"]:
+            return "ONCVPSP"
+        if "ONCVPSP" in parsed["PP_HEADER"]["attrib"]["generated"]:
+            return "ONCVPSP"
+
+        """ADC
+        ADC is the format of pseudopotential collected in pslibrary, including
+        pslnc, rrkjus and kjpaw, most collected in QE website the pptable"""
+        if "ADC" in parsed["PP_INFO"]["data"]:
+            return "ADC"
+        if "ADC" in parsed["PP_HEADER"]["attrib"]["generated"]:
+            return "ADC"
+        if "ADC" in parsed["PP_HEADER"]["attrib"]["author"]:
+            return "ADC"
+        if "Generated using \"atomic\" code by A. Dal Corso" in parsed["PP_INFO"]["data"]:
+            return "ADC"
+        if "Generated using \"atomic\" code by A. Dal Corso" in parsed["PP_HEADER"]["attrib"]["generated"]:
+            return "ADC"
+        if "Generated using \"atomic\" code by A. Dal Corso" in parsed["PP_HEADER"]["attrib"]["author"]:
+            return "ADC"
+        
+        """GTH
+        this is the kind developed by CP2K developers, Goedecker, Hartwigsen, Hutter and Teter
+        et al. However, this kind of pseudopotential has non-diagonal element in DIJ matrices,
+        which is not supported by ABACUS yet."""
+        if "Goedecker/Hartwigsen/Hutter/Teter" in parsed["PP_HEADER"]["attrib"]["author"]:
+            return "GTH"
+            raise NotImplementedError("GTH pseudopotential is not supported by ABACUS yet because of non-diagonal DIJ matrices")
+        
+        """GBRV
+        It is one of the most efficient pseudopotential presently, ABACUS pw supports this kind
+        of pseudopotential, ABACUS lcao not yet.
+        """
+        if "Generated using Vanderbilt code" in parsed["PP_INFO"]["data"]:
+            return "GBRV"
+
+        """ATOMPAW
+        atompaw looks like ADC but not quite the same in occupation information
+        Comparatively the uni_marburg is actually more similar to ADC"""
+        if "ATOMPAW" in parsed["PP_INFO"]["data"]:
+            return "ATOMPAW"
+        if "ATOMPAW" in parsed["PP_HEADER"]["attrib"]["generated"]:
+            return "ATOMPAW"
+        
+        """vwr
+        vwr is a old-fashioned pseudopotential format, there are codes can convert the vwr to upf format. Then this branch
+        will be called to parse the upf format pseudopotential"""
+        if "vwr" in parsed["PP_INFO"]["data"] or "VWR" in parsed["PP_INFO"]["data"]:
+            return "VWR"
+        
+    except KeyError:
+        print(error_msg, flush = True)
+        raise TypeError("ERROR: please see error message above.")
 
 def val_conf(parsed: dict):
     """extract valence electron configuration from pseudopotential file