mromain
XLDnaute Barbatruc
Bonjour à tous,
J’ai récemment eu à récupérer des informations issues de tableaux Word via PowerQuery.
Les seules solutions que j’ai alors trouvées consistaient à convertir au préalable le document Word dans un autre format (pdf, html) pour pouvoir le lire ensuite via PowerQuery.
La fonction ExtractWordTables ci-dessous permet de se passer de cette étape de conversion pour attaquer directement le fichier Word (au format Open XML).
Elle est sûrement imparfaite, un peu lente, mais elle a bien répondu à mon besoin, du coup je partage.
A+
Edit : compatibilité XL 2013
J’ai récemment eu à récupérer des informations issues de tableaux Word via PowerQuery.
Les seules solutions que j’ai alors trouvées consistaient à convertir au préalable le document Word dans un autre format (pdf, html) pour pouvoir le lire ensuite via PowerQuery.
La fonction ExtractWordTables ci-dessous permet de se passer de cette étape de conversion pour attaquer directement le fichier Word (au format Open XML).
Elle est sûrement imparfaite, un peu lente, mais elle a bien répondu à mon besoin, du coup je partage.
Code:
let
fn = (wordFilePath as text) as table =>
let
// unzip function (source: https://github.com/ibarrau/PowerBi-code/blob/master/PowerQuery/ExtractZIP.pq)
fnUnzipFile = (ZIPFile as binary) =>
let
Header = BinaryFormat.Record([ Signature = BinaryFormat.ByteOrder(BinaryFormat.UnsignedInteger32,ByteOrder.LittleEndian),
Version = BinaryFormat.ByteOrder(BinaryFormat.UnsignedInteger16,ByteOrder.LittleEndian),
Flags = BinaryFormat.ByteOrder(BinaryFormat.UnsignedInteger16,ByteOrder.LittleEndian),
Compression = BinaryFormat.ByteOrder(BinaryFormat.UnsignedInteger16,ByteOrder.LittleEndian),
ModTime = BinaryFormat.ByteOrder(BinaryFormat.UnsignedInteger16,ByteOrder.LittleEndian),
ModDate = BinaryFormat.ByteOrder(BinaryFormat.UnsignedInteger16,ByteOrder.LittleEndian),
CRC32 = BinaryFormat.ByteOrder(BinaryFormat.UnsignedInteger32,ByteOrder.LittleEndian),
CompressedSize = BinaryFormat.ByteOrder(BinaryFormat.UnsignedInteger32,ByteOrder.LittleEndian),
UncompressedSize = BinaryFormat.ByteOrder(BinaryFormat.UnsignedInteger32,ByteOrder.LittleEndian),
FileNameLen = BinaryFormat.ByteOrder(BinaryFormat.UnsignedInteger16,ByteOrder.LittleEndian),
ExtraFieldLen = BinaryFormat.ByteOrder(BinaryFormat.UnsignedInteger16,ByteOrder.LittleEndian)
]),
FileEntry = BinaryFormat.Choice(Header, each if _[Signature] <> 0x4034B50 then BinaryFormat.Null else
BinaryFormat.Record([
Header = _,
FileName = BinaryFormat.Text(_[FileNameLen]),
ExtraField = BinaryFormat.Text(_[ExtraFieldLen]),
UncompressedData = BinaryFormat.Transform(BinaryFormat.Binary(_[CompressedSize]),(x) => try Binary.Buffer(Binary.Decompress(x, Compression.Deflate)) otherwise null)
]), type binary),
ZipFormat = BinaryFormat.List(FileEntry, each _<> null),
Entries = List.Transform(
List.RemoveLastN( ZipFormat(ZIPFile), 1),
(e) => [FileName = e[FileName], Content = e[UncompressedData] ]
)
in
Table.FromRecords(Entries),
// function used to perform multiple Text.Replace
fnReplace = (text as text, replacements as list) as text => let t=Text.Replace(text, replacements{0}{0}, replacements{0}{1}) in if List.Count(replacements)=1 then t else @fnReplace(t, List.RemoveFirstN(replacements, 1)),
// function used to format a javascript string
fnFormatJsString = (text as text) as text => fnReplace(text, {{"\", "\\"}, {"'", "\'"}, {"#(lf)", "\n"}}),
// regex Replace
fnRegExpReplace = (text as nullable text,pattern as nullable text,replace as nullable text) as text =>
let
scriptJS = "var input = '" & fnFormatJsString(text) & "';
var regExp = " & pattern & ";
var replace = '" & fnFormatJsString(replace) & "';
var result = input.replace(regExp, replace);
document.write('""' + result + '""')",
webPage = Web.Page("<script>" & scriptJS & "</script>"),
resultJS = webPage{0}[Data]{0}[Children]{1}[Children]{0}[Text],
toText = Expression.Evaluate(resultJS)
in
toText,
// find all regex matches
fnGetRegExpMatches = (text as text, pattern as text) as table =>
let
scriptJS = "var input = '" & fnFormatJsString(text) & "';
var regExp = " & pattern & ";
var match;
var matchesInfos = [];
var result;
var nbSubMatches=0;
while (match = regExp.exec(input)){
var matchInfos = [];
if (match.length-1 > nbSubMatches){
nbSubMatches = match.length-1;
}
matchInfos.push(match.index.toString());
for (var i = 0; i < match.length; i++){
matchInfos.push('""' + match[i].toString().replace(/""/g, '""""') + '""');
}
matchesInfos.push('{' + matchInfos.join(', ') + '}');
}
result = '#table(type table [Position=number, Match=text'
for (var i = 1; i <= nbSubMatches; i++){
result += (', Group' + i.toString() + '=text');
}
result += '], {' + matchesInfos.join(', ') + '})';
result = result.replace(/[<>&\n]/g, function(x) {return {'<': '<', '>': '>', '&': '&', '\n': '<br />'}[x];});
document.write(result)",
webPage = Web.Page("<script>" & scriptJS & "</script>"),
resultJS = webPage{0}[Data]{0}[Children]{1}[Children]{0}[Text],
toTable = Expression.Evaluate(resultJS)
in
toTable,
// function used to get 'word/document.xml' of an openXml Word document
fnGetDocumentXml = (filePath as text) as text =>
let
FileBinary = try Binary.Buffer(File.Contents(filePath)) otherwise error [Reason="Source file", Message="Source file was not found (" & filePath & ")."],
FileContent = fnUnzipFile(FileBinary) ,
FiltreDocContent = let rowDocumentXml = Table.SelectRows(FileContent, each ([FileName] = "word/document.xml")) in if Table.RowCount(rowDocumentXml) = 1 then rowDocumentXml else error [Reason="Source file", Message="Source file is not a valid OpenXML Word file (" & filePath & ")."],
TexteDocContent = Table.FromColumns({Lines.FromBinary(FiltreDocContent{0}[Content],null,null,65001)}){1}[Column1]
in
TexteDocContent,
// function used to remove code tags (manage tabulation and new lines)
fnCleanText = (xmlText as text) as text =>
let
xmlParagraphs = Text.Split(Text.Replace(xmlText, "<w:br/>", "</w:p>"), "</w:p>"),
splitTabs = List.Transform(xmlParagraphs, each Text.Split(_, "<w:tab/>")),
cleanSplitTabs = List.Transform(splitTabs, (listXmls as list) as list => List.Transform(listXmls, each fnRegExpReplace(_, "/<((?!>).)*>/g", ""))),
joinTabs = let listParagraphs = List.Transform(cleanSplitTabs, each Text.Combine(_, "#(tab)")) in if List.Last(listParagraphs)="" then List.RemoveLastN(listParagraphs, 1) else listParagraphs,
joinParagraphs = Text.Combine(joinTabs, "#(lf)")
in
joinParagraphs,
// function used to manage a cell (even if merged)
fnExtractCell = (cellXml as text) as list =>
let
nbMerge = let val = Number.From(Text.BetweenDelimiters(cellXml, "<w:gridSpan w:val=""", """")) in if val = null then 1 else val,
cellValue = let rawVal = fnCleanText(cellXml) in if rawVal = "" then null else rawVal,
listMergedCells = List.Transform({1 .. nbMerge}, each if _=1 then cellValue else null)
in
listMergedCells,
// function used to split a row in columns items
fnExtractColumnsValues = (xmlColumns as list, nbCol as number) as list =>
let
cleanColumns = List.Transform(xmlColumns, fnExtractCell),
combineColumns = List.Combine(cleanColumns),
completeColumns = let cols = List.Count(combineColumns) in combineColumns & (if cols=nbCol then {} else List.Transform({1 .. nbCol-cols}, each null))
in
completeColumns,
// function used to convert a openXml table code in table
fnExtractTable = (tableXml as text) as table =>
let
nbCol = List.Count(Text.Split(tableXml, "<w:gridCol"))-1,
rowsMatches = fnGetRegExpMatches(tableXml, "/<w:tr[ >](((?!<w:tr[ >]).)*)<\/w:tr>/g"),
extractColumns = Table.AddColumn(rowsMatches, "Columns", each fnGetRegExpMatches([Match], "/<w:tc[ >](((?!<w:tc[ >]).)*)<\/w:tc>/g")[Match], type list),
extractColumnsValues = Table.TransformColumns(extractColumns,{{"Columns", each fnExtractColumnsValues(_, nbCol)}}),
ToTable = Table.FromRows(extractColumnsValues[Columns])
in
ToTable,
documentXml = fnGetDocumentXml(wordFilePath),
tablesMatches = fnGetRegExpMatches(documentXml, "/<w:tbl>(((?!<w:tbl>).)*)<\/w:tbl>/g")[[Match]],
colIndex = Table.AddIndexColumn(tablesMatches, "Index", 1, 1),
colTable = Table.AddColumn(colIndex, "Table", each fnExtractTable([Match]), type table),
result = Table.RemoveColumns(colTable,{"Match"})
in
result,
documentationMetadata =
[
Documentation.Name = "ExtractWordTables",
Documentation.Description = "This function extract tables of an OpenXml Word file (docx, docm, ...).",
Documentation.Examples =
{
[
Description = "List all tables of ""c:\folder\document.docx"" document:",
Code = "ExtractWordTables(""c:\folder\document.docx"")",
Result = "-- Table with all document tables. --"
],
[
Description = "Extract first table of ""c:\folder\document.docx"" document:",
Code = "ExtractWordTables(""c:\folder\document.docx""){0}[Table]",
Result = "-- First table of the document. --"
]
}
]
in
Value.ReplaceType(fn, Value.ReplaceMetadata(Value.Type(fn), documentationMetadata))
A+
Edit : compatibilité XL 2013
Dernière édition: