Just as a followup - I ended up using a Python script to recurse through the folders of files and, for each file found, extract the doctor/date/medical record number from the filename, then run a Word macro on the file to insert the header slug.
If anyone ever needs to do something similar, or just wants to laugh at what a hack this turned out to be, I've attached a text file containing the Python and the macro. (The UIDs and names have all been sanitized for my protection.) Bear in mind that this was a one-off; if I end up re-using this I will clean it up, put things in classes, load UIDs from an external file, etc. Background: In the folder "U:\transcripts.doc" there are folders called "Recent" and "Archive" for each of about twelve providers; I need only Able, Baker, Doe, Roe, Smith and Jones - that's why I put the folder names in a tuple instead of simply processing the root folder. Inside of each folder there are subfolders for individual patients, but also many patient documents in the main folder. File names are supposed to follow the format "Last,First-MRN-01Jan2008-Able.doc", but there are many variations on this theme: "Last,First-01Jan2008-MRN-Able.doc", "Last,First-MRN-01Jan2008-Able-Echo.doc", "Last,First-MRN-01Jan2008-Echo-Able.doc", "Last,First-01Jan2008-MRN-Echo-Able.doc", "Last,First-01Jan2008-MRN-Able-Echo.doc", etc. Last,First - the patient's name; irrelevant to my purpose. MRN - medical record number. Echo - most of these files are consultations, but the ones labeled "Echo" are echocardiogram reports. For these I need to set Category and Description to "Echo"; otherwise it's "Consultation External"/"Consultation". The doctor is supposed to dictate the MRN, and the transcriptionist puts it in the filename - but there are many cases where the MRN is missing. These can look like: "Last,First-XXX-01Jan2008-Able.doc", "Last,First--01Jan2008-Able.doc", "Last,First-01Jan2008-Able.doc", so I needed several different filters. The date needs to be passed in MM/DD/YYYY format; I found that the easiest way to figure out which field was the date was to try/except strptime(). I'm sure there's a more elegant way using regexp, but I was in quick-and-dirty mode. 
As you might expect, most of the time is spent in Word. It would probably be faster if I set Visible = False, but for some reason the macro fails (silently) when I do that, so I just minimize Word instead to cut down on screen refreshes. Also, drive U is a Samba share, so there's network latency to take into account. Even so, 10,231 files took less than 15 minutes to convert, which I can live with. -- www.fsrtechnologies.com
#!/usr/bin/python import os, os.path, sys, time import logging from win32com.client import Dispatch startDirs =("u:/transcripts.doc/ABLE-Recent", "u:/transcripts.doc/ABLE-Archive", "u:/transcripts.doc/BAKER-Recent", "u:/transcripts.doc/BAKER-Archive", "u:/transcripts.doc/DOE-Recent", "u:/transcripts.doc/DOE-Archive", "u:/transcripts.doc/ROE-Recent", "u:/transcripts.doc/ROE-Archive", "u:/transcripts.doc/SMITH-Recent", "u:/transcripts.doc/SMITH-Archive", "u:/transcripts.doc/JONES-Recent", "u:/transcripts.doc/JONES-Archive") targetDir = "U:/xsRTF" class Global(object): prov = { 'ABLE': 'F000789F-3001-400F-845B-B0D78B0B000B', 'BAKER': 'B000FBD5-0007-4007-A959-60B63B32000C', 'DOE': '30008D6D-8008-400D-B031-700ECD75000C', 'ROE': '90002DC9-100A-4005-B284-D07632730000', 'SMITH': '7000FD10-5009-4000-9CDB-608FDEF2000F', 'JONES': 'F000DDE8-400E-400C-8B7D-10CE9F5C0000'} # There are UIDs for hospitals as well, but I haven't seen any dictations # for hospital visits - so I'm assuming everything happened in the office. # There may be some exceptions, and I may have to clean them up later... # but I don't see how I can help it now. locID = "90001E61-200A-4004-9006-700063790002" # the only extra description that ever comes up is "Echo", # but I put it in a dict just in case I found more desc = { 'ECHO': ('Echo','Echo'), } totalFiles = 0 skippedFiles = 0 failedFiles = 0 def doConversion(oldPath): oldName = os.path.basename(oldPath) # there are lots of daily roster files - we don't want them # also lots of orphaned temporary documents... 
if oldName.upper().strip().startswith(('ROSTER', '~'),0): Global.skippedFiles +=1 return nameParts = oldName.rstrip(".doc").split('-') # No MRN to be had from "Last,First-Date-Doc.doc" or # "Last,First--Date-Doc.doc" if ((len(nameParts) < 4) or (len(nameParts[1]) == 0)): logging.error("%s - no MRN" % oldPath) Global.failedFiles +=1 return newParts = {} try: # filename has date in standard position tmpDate = time.strptime(nameParts[2].strip(), "%d%B%Y") except ValueError: try: # "Last,First-Date-Doc-Echo.doc" tmpDate = time.strptime(nameParts[1].strip(), "%d%B%Y") logging.error("%s - no MRN" % oldPath) Global.failedFiles +=1 return except ValueError: try: # "Last,First-MRN-Echo-Date-Doc.doc" or "Last,First-MRN-Doc-Date.doc" # or some other bizarre combo - but we can work with it tmpDate = time.strptime(nameParts[3].strip(), "%d%B%Y") except ValueError: logging.error("%s - no date" % oldPath) Global.failedFiles +=1 return newParts['date'] = time.strftime("%m/%d/%Y", tmpDate) # save time by cleaning these up just once tmpStr1 = nameParts[-1].upper().strip() tmpStr2 = nameParts[-2].upper().strip() # Files may be "Last,First-MRN-Date-Doc.doc" (most common) OR # may have an extra description, like so: # "Last,First-MRN-Date-Echo-Doc.doc" OR # "Last,First-MRN-Date-Doc-Echo.doc" if tmpStr1 in Global.prov: newParts['prov'] = Global.prov[tmpStr1] elif tmpStr2 in Global.prov: newParts['prov'] = Global.prov[tmpStr2] else: logging.error("%s - no provider" % oldPath) Global.failedFiles +=1 return if tmpStr1 in Global.desc: newParts['cat'] = Global.desc[tmpStr1][0] newParts['desc'] = Global.desc[tmpStr1][1] elif tmpStr2 in Global.desc: newParts['cat'] = Global.desc[tmpStr2][0] newParts['desc'] = Global.desc[tmpStr2][1] else: newParts['cat'] = 'Consultation External' newParts['desc'] = 'Consultation' # lots and lots of files with no MRN but x's as placeholders like this: # "Last,First-xxx-Date-Doc.doc" if nameParts[1][1].upper() == 'X': logging.error("%s - no MRN" % oldPath) 
Global.failedFiles +=1 return else: newParts['MRN'] = nameParts[1].strip() newParts['loc'] = Global.locID try: doc = Global.wrd.Documents.Open(os.path.abspath(oldPath)) except Exception, exc: logging.error("opening %s -- %s" % (oldPath, exc)) Global.failedFiles +=1 try: Global.wrd.Run("RTF", newParts['MRN'], newParts['date'], newParts['prov'], newParts['cat'], newParts['loc'], newParts['desc'], targetDir) except Exception, exc: logging.error("error while converting %s -- %s" % (oldPath, exc)) Global.failedFiles +=1 # the guts... logging.basicConfig(level=logging.DEBUG, format='%(levelname)s %(message)s', filename='RTF.log', filemode='w') Global.wrd = Dispatch('Word.Application') Global.wrd.Visible = 1 time.clock() for startDir in startDirs: logging.info("Starting to process folder: %s" % startDir) folders = [startDir] while len(folders)>0: fld = folders.pop() for name in os.listdir(fld): fullpath = os.path.join(fld,name) if os.path.isfile(fullpath): Global.totalFiles +=1 #logging.debug(fullpath) doConversion(fullpath) elif os.path.isdir(fullpath): folders.append(fullpath) logging.info("Finished with folder: %s - time elapsed %.4f seconds" % (startDir, time.clock()/10)) Global.wrd.Quit() logging.info("All done! 
Total files: %s Skipped (rosters, etc.): %s Failed: %s - total processing time %.4f seconds" % (Global.totalFiles, Global.skippedFiles, Global.failedFiles, time.clock()/10)) #======================================================================== # Word Macro follows #======================================================================== Sub RTF(Optional ByVal MedRecNum As String = "", _ Optional ByVal dateStr As String = "", _ Optional ByVal ProvID As String = "", _ Optional ByVal CatID As String = "Consultation External", _ Optional ByVal LocID As String = "90000E61-200A-4004-9006-700863790002", _ Optional ByVal Desc As String = "Consultation", _ Optional ByVal Dest As String = "U:\xscripts) Dim rng As Range Dim NewName As String Set rng = ActiveDocument.Range(Start:=0, End:=0) rng.InsertBreak Type:=wdSectionBreakNextPage ActiveDocument.Sections(2).Headers(wdHeaderFooterPrimary).LinkToPrevious = False ActiveDocument.Sections(2).Footers(wdHeaderFooterPrimary).LinkToPrevious = False ActiveDocument.Sections(2).Headers(wdHeaderFooterFirstPage).LinkToPrevious = False ActiveDocument.Sections(2).Footers(wdHeaderFooterFirstPage).LinkToPrevious = False ActiveDocument.Sections(2).Headers(wdHeaderFooterEvenPages).LinkToPrevious = False ActiveDocument.Sections(2).Footers(wdHeaderFooterEvenPages).LinkToPrevious = False ActiveDocument.Sections(1).Headers(wdHeaderFooterPrimary).Range.Text = "" ActiveDocument.Sections(1).Footers(wdHeaderFooterPrimary).Range.Text = "" ActiveDocument.Sections(1).Headers(wdHeaderFooterFirstPage).Range.Text = "" ActiveDocument.Sections(1).Footers(wdHeaderFooterFirstPage).Range.Text = "" ActiveDocument.Sections(1).Headers(wdHeaderFooterEvenPages).Range.Text = "" ActiveDocument.Sections(1).Footers(wdHeaderFooterEvenPages).Range.Text = "" Set rng = ActiveDocument.Range(Start:=0, End:=0) rng.Font.Bold = False rng.Font.Name = "Arial" rng.Font.Size = 10 rng.ParagraphFormat.Alignment = wdAlignParagraphLeft rng.ParagraphFormat.LeftIndent = 0 rng.Text = 
vbCr + _ "Patient MRN: " + MedRecNum + vbCr + _ "Date: " + dateStr + vbCr + _ "Category: " + CatID + vbCr + _ "Description: " + Desc + vbCr + _ "Provider ID:" + ProvID + vbCr + _ "Location ID:" + LocID + vbCr + _ "Enterprise ID: 00001" + vbCr + _ "Practice ID: 0001" + vbCr rng.Font.Bold = False rng.Font.Name = "Arial" rng.Font.Size = 10 rng.ParagraphFormat.Alignment = wdAlignParagraphLeft rng.ParagraphFormat.LeftIndent = 0 NewName = ActiveDocument.Name + ".rtf" ChangeFileOpenDirectory Dest ActiveDocument.SaveAs FileName:=NewName, FileFormat:=wdFormatRTF ActiveWindow.Close End Sub
_______________________________________________ Tutor maillist - Tutor@python.org http://mail.python.org/mailman/listinfo/tutor