最近有个需求:使用 C# 操作 Word 文件,获取其中的标题以及其他所需引导词后面的内容。如下图所示,需要获取文件中的标题和引导词后面的内容。
采用的是 Open XML,将 Word 文件转换成 XML(Open XML 只支持 docx 文件格式,如果不是 docx 格式,可以先通过 Word 转换过来)。这样可以获取每个段落中的所有文字和标签样式。已知标题的字体最大,因此根据标签样式属性找出字号最大的那个段落,即是标题;其他内容可以通过 IndexOf 查询关键词进行获取。
先看看完成效果
代码如下(初学C#一周,如果有写的不好的地方,望大佬指正):
所需插件:
引入
using System;
using System.Collections.Generic;
using System.Data;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Collections;
using System.Xml;
using DocumentFormat.OpenXml.Packaging;
定义
/// <summary>
/// Result of one extraction run: the document title plus the text captured for
/// each guide word.
/// </summary>
public class objList
{
    /// <summary>Text of the paragraph chosen as the title.</summary>
    public string title;

    /// <summary>Text captured for the "biaoji1" field.</summary>
    public string biaoji1;

    /// <summary>Text captured for the "number" field.</summary>
    public string number;

    /// <summary>Text captured for the "biaoji3" field.</summary>
    public string biaoji3;
}

/// <summary>
/// Snapshot of a paragraph: its position in the paragraph list, its raw inner XML,
/// its plain text and an integer ranking value.
/// </summary>
public class textObjectAll
{
    /// <summary>Position of the paragraph in the paragraph list.</summary>
    public int index;

    /// <summary>Inner XML of the paragraph node.</summary>
    public string innerXml;

    /// <summary>Plain text of the paragraph node.</summary>
    public string innerText;

    /// <summary>Integer value used to rank the paragraph.</summary>
    public int indexof;
}

/// <summary>
/// A paragraph that is a candidate for the title, ranked by <see cref="indexof"/>.
/// </summary>
public class textObjectCh
{
    /// <summary>Position of the paragraph in the paragraph list.</summary>
    public int index;

    /// <summary>Inner XML of the paragraph node.</summary>
    public string innerXml;

    /// <summary>Plain text of the paragraph node.</summary>
    public string innerText;

    /// <summary>Integer value used to rank the paragraph.</summary>
    public int indexof;
}

/// <summary>
/// Names of the fields that can be requested from the extractor; the member names
/// are passed around as strings via <c>ToString()</c>.
/// </summary>
public enum WordKind
{
    Title,
    biaoji1,
    number,
    biaoji3
}
方法:(这是测试文件,所以定义的少,实际文件中会有很多引导词)
/// <summary>Marks that the last matched ranking value came from a <c>w:pStyle</c> element.</summary>
private const int RankFromStyle = 1;

/// <summary>Marks that the last matched ranking value came from a <c>w:sz</c> element.</summary>
private const int RankFromSize = 2;

/// <summary>
/// Opens a .docx file read-only and extracts its title together with the text that
/// follows the requested guide words.
/// </summary>
/// <remarks>
/// Title candidates are the non-empty paragraphs whose <c>w:pStyle</c> or
/// <c>w:sz</c> value parses as an integer. When the most recently matched element
/// was a <c>w:pStyle</c> the smallest value wins; otherwise the largest value
/// (the biggest font size) wins. Ties go to the earliest paragraph.
/// For every guide word the last matching paragraph in the document wins.
/// </remarks>
/// <param name="docPath">Path of the .docx file to read.</param>
/// <param name="Kind">
/// Names of the fields to extract: "biaoji1", "number" and/or "biaoji3". Any other
/// entry (including <c>null</c>) is ignored; a <c>null</c> array requests no fields.
/// </param>
/// <returns>
/// An <c>objList</c> instance. A field is left <c>null</c> when nothing was found
/// for it; <c>title</c> is <c>null</c> when no paragraph carried a numeric rank.
/// </returns>
public static object GetContentFromWord(string docPath, string[] Kind)
{
    const string wordmlNamespace = "http://schemas.openxmlformats.org/wordprocessingml/2006/main";

    objList result = new objList();
    string biaoji1 = "";
    string number = "";
    string biaoji3 = "";
    int rankSource = 0;
    List<textObjectCh> candidates = new List<textObjectCh>();

    using (WordprocessingDocument wdDoc = WordprocessingDocument.Open(docPath, false))
    {
        if (wdDoc.MainDocumentPart == null)
        {
            // Nothing to parse; hand back the empty result instead of crashing.
            return result;
        }

        NameTable nt = new NameTable();
        XmlNamespaceManager nsManager = new XmlNamespaceManager(nt);
        nsManager.AddNamespace("w", wordmlNamespace);

        XmlDocument xdoc = new XmlDocument(nt);
        // The part stream is disposable and must be released once the XML is loaded.
        using (var partStream = wdDoc.MainDocumentPart.GetStream())
        {
            xdoc.Load(partStream);
        }

        XmlNodeList paragraphNodes = xdoc.SelectNodes("//w:p", nsManager);
        for (int i = 0; i < paragraphNodes.Count; i++)
        {
            XmlNode paragraph = paragraphNodes[i];
            string paragraphText = paragraph.InnerText;
            if (string.IsNullOrEmpty(paragraphText))
            {
                continue;
            }

            if (Kind != null)
            {
                for (int k = 0; k < Kind.Length; k++)
                {
                    string found;
                    switch (Kind[k])
                    {
                        case "biaoji1":
                            // Value sits between "标记1" (+ one separator char) and "号码".
                            found = ExtractBetweenMarkers(paragraphText, "标记1", 4, "号码");
                            if (found != null)
                            {
                                biaoji1 = found;
                            }
                            break;
                        case "number":
                            // Value is everything after "号码" plus one separator char.
                            found = ExtractAfterMarker(paragraphText, "号码", 3);
                            if (found != null)
                            {
                                number = found;
                            }
                            break;
                        case "biaoji3":
                            // Slice at the same marker that was tested for. The original
                            // tested "标记3" but sliced at "DOI", giving a wrong offset
                            // whenever "DOI" was absent.
                            found = ExtractAfterMarker(paragraphText, "标记3", 4);
                            if (found != null)
                            {
                                biaoji3 = found;
                            }
                            break;
                    }
                }
            }

            // Computed fresh for each paragraph so that a paragraph without its own
            // formatting no longer inherits the previous paragraph's value.
            string keyword = ReadRankKeyword(paragraph, nsManager, ref rankSource);

            int rank;
            if (int.TryParse(keyword, out rank))
            {
                textObjectCh candidate = new textObjectCh();
                candidate.index = i;
                candidate.innerText = paragraphText;
                candidate.innerXml = paragraph.InnerXml;
                candidate.indexof = rank;
                candidates.Add(candidate);
            }
        }
    }

    if (candidates.Count > 0)
    {
        // OrderBy / OrderByDescending are stable, so ties keep document order.
        textObjectCh best = rankSource == RankFromStyle
            ? candidates.OrderBy(item => item.indexof).First()
            : candidates.OrderByDescending(item => item.indexof).First();
        result.title = best.innerText;
    }

    if (biaoji1.Length > 0)
    {
        result.biaoji1 = biaoji1;
    }
    if (number.Length > 0)
    {
        result.number = number;
    }
    if (biaoji3.Length > 0)
    {
        result.biaoji3 = biaoji3;
    }

    return result;
}

/// <summary>
/// Returns the text that follows <paramref name="marker"/>, skipping
/// <paramref name="skip"/> characters from the start of the marker.
/// </summary>
/// <returns>
/// <c>null</c> when the marker is absent; an empty string when nothing follows it.
/// </returns>
private static string ExtractAfterMarker(string text, string marker, int skip)
{
    int markerAt = text.IndexOf(marker, StringComparison.Ordinal);
    if (markerAt < 0)
    {
        return null;
    }

    // Clamp so a marker at the very end of the paragraph yields "" instead of throwing.
    int start = Math.Min(markerAt + skip, text.Length);
    return text.Substring(start);
}

/// <summary>
/// Returns the text between <paramref name="startMarker"/> (skipping
/// <paramref name="skip"/> characters from its start) and <paramref name="endMarker"/>.
/// </summary>
/// <returns>
/// <c>null</c> when either marker is absent or the end marker does not come after
/// the skipped start position.
/// </returns>
private static string ExtractBetweenMarkers(string text, string startMarker, int skip, string endMarker)
{
    int startAt = text.IndexOf(startMarker, StringComparison.Ordinal);
    int endAt = text.IndexOf(endMarker, StringComparison.Ordinal);
    if (startAt < 0 || endAt < 0)
    {
        return null;
    }

    int start = startAt + skip;
    int length = endAt - start;
    if (length < 0)
    {
        return null;
    }

    return text.Substring(start, length);
}

/// <summary>
/// Extracts the raw ranking text of a paragraph from its first <c>w:pStyle</c>
/// element or, when the paragraph has none, from its first <c>w:sz</c> element.
/// </summary>
/// <param name="paragraph">The <c>w:p</c> node to inspect.</param>
/// <param name="nsManager">Namespace manager that maps the "w" prefix.</param>
/// <param name="rankSource">
/// Set to <see cref="RankFromStyle"/> or <see cref="RankFromSize"/> when a value is
/// extracted; left untouched otherwise.
/// </param>
/// <returns>The extracted text, or <c>null</c> when nothing matched.</returns>
private static string ReadRankKeyword(XmlNode paragraph, XmlNamespaceManager nsManager, ref int rankSource)
{
    // The offsets and length windows below are kept from the original code. They slice
    // the element's OuterXml and presumably assume it serializes as
    // <w:xx w:val="..." xmlns:w="http://..." /> - TODO confirm against real documents.
    XmlNodeList styleNodes = paragraph.SelectNodes(".//w:pStyle", nsManager);
    if (styleNodes.Count > 0)
    {
        string styleXml = styleNodes[0].OuterXml;
        int valAt = styleXml.IndexOf("w:val", StringComparison.Ordinal);
        int nsAt = styleXml.IndexOf("http://", StringComparison.Ordinal);
        int gap = nsAt - valAt;
        if (valAt >= 0 && nsAt >= 0 && gap > 20 && gap < 25)
        {
            rankSource = RankFromStyle;
            return styleXml.Substring(valAt + 9, gap - 20);
        }

        // A paragraph with a style element never falls back to w:sz.
        return null;
    }

    XmlNodeList sizeNodes = paragraph.SelectNodes(".//w:sz", nsManager);
    if (sizeNodes.Count > 0)
    {
        string sizeXml = sizeNodes[0].OuterXml;
        int valAt = sizeXml.IndexOf("w:val", StringComparison.Ordinal);
        int nsAt = sizeXml.IndexOf("xmlns:w", StringComparison.Ordinal);
        int gap = nsAt - valAt;
        if (valAt >= 0 && nsAt >= 0 && gap > 9 && gap < 20)
        {
            rankSource = RankFromSize;
            return sizeXml.Substring(valAt + 7, gap - 9);
        }
    }

    return null;
}
调用方法:
/// <summary>
/// Click handler: lets the user pick a .docx file, runs <c>GetContentFromWord</c> on
/// it and copies the extracted fields into the form's text controls.
/// </summary>
/// <param name="sender">The control that raised the event.</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    string[] requestedKinds =
    {
        WordKind.Title.ToString(),
        WordKind.biaoji1.ToString(),
        WordKind.number.ToString(),
        WordKind.biaoji3.ToString()
    };

    // OpenFileDialog is IDisposable; the using block releases it after the dialog closes.
    using (OpenFileDialog openFileDialog1 = new OpenFileDialog())
    {
        openFileDialog1.InitialDirectory = "c:\\";
        // Both original filter entries matched only *.docx but were labelled
        // "txt files" and "All files"; one correctly labelled entry replaces them.
        openFileDialog1.Filter = "Word documents (*.docx)|*.docx";
        openFileDialog1.FilterIndex = 1;
        openFileDialog1.RestoreDirectory = true;

        if (openFileDialog1.ShowDialog() != DialogResult.OK)
        {
            return;
        }

        this.FilePath.Text = openFileDialog1.FileName;

        objList objList = (objList)GetContentFromWord(openFileDialog1.FileName, requestedKinds);
        this.Title.Text = objList.title;
        this.biaoji1.Text = objList.biaoji1;
        this.number.Text = objList.number;
        this.biaoji3.Text = objList.biaoji3;
    }
}