Иногда бывает полезно извлечь из документа только текст — например, чтобы передать его в другой сервис через API. Весь текст документа содержится на его вкладках, в текстовых фрагментах (text runs) элементов абзаца. Чтобы извлечь весь текст документа, нужно обойти иерархию дерева вкладок и вызвать методы получения (геттеры) объектов `Tab` и `DocumentTab`. Дополнительную информацию о вкладках см. в разделе «Работа с вкладками».
Текст может появляться в трех типах структурных элементов вкладки документа:
- Абзац
- Оглавление
- Таблица
Таблицы могут быть вложены в другую таблицу. Следовательно, чтобы извлечь весь текст документа, необходимо посетить каждый вложенный структурный элемент.
Полное описание тела документа см. в руководстве «Структура документа».
В следующем примере API Документов Google используется рекурсия для посещения каждого структурного элемента на всех вкладках документа и печати текста.
// Copyright 2019 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
import com.google.api.client.auth.oauth2.Credential;
import com.google.api.client.extensions.java6.auth.oauth2.AuthorizationCodeInstalledApp;
import com.google.api.client.extensions.jetty.auth.oauth2.LocalServerReceiver;
import com.google.api.client.googleapis.auth.oauth2.GoogleAuthorizationCodeFlow;
import com.google.api.client.googleapis.auth.oauth2.GoogleClientSecrets;
import com.google.api.client.googleapis.javanet.GoogleNetHttpTransport;
import com.google.api.client.http.javanet.NetHttpTransport;
import com.google.api.client.json.JsonFactory;
import com.google.api.client.json.jackson2.JacksonFactory;
import com.google.api.client.util.store.FileDataStoreFactory;
import com.google.api.services.docs.v1.Docs;
import com.google.api.services.docs.v1.DocsScopes;
import com.google.api.services.docs.v1.model.Document;
import com.google.api.services.docs.v1.model.DocumentTab;
import com.google.api.services.docs.v1.model.ParagraphElement;
import com.google.api.services.docs.v1.model.StructuralElement;
import com.google.api.services.docs.v1.model.Tab;
import com.google.api.services.docs.v1.model.TableCell;
import com.google.api.services.docs.v1.model.TableRow;
import com.google.api.services.docs.v1.model.TextRun;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.security.GeneralSecurityException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
 * Sample that prints all of the text of a Google Doc by visiting every tab (including nested
 * child tabs) and recursing through every structural element of each tab's body.
 */
public class ExtractText {
  private static final String APPLICATION_NAME = "Google Docs API Extract Guide";
  private static final JsonFactory JSON_FACTORY = JacksonFactory.getDefaultInstance();
  private static final String TOKENS_DIRECTORY_PATH = "tokens";
  private static final String DOCUMENT_ID = "YOUR_DOCUMENT_ID";

  /**
   * Global instance of the scopes required by this quickstart. If modifying these scopes, delete
   * your previously saved tokens/ folder.
   */
  private static final List<String> SCOPES =
      Collections.singletonList(DocsScopes.DOCUMENTS_READONLY);

  private static final String CREDENTIALS_FILE_PATH = "/credentials.json";

  /**
   * Creates an authorized Credential object.
   *
   * @param HTTP_TRANSPORT The network HTTP Transport.
   * @return An authorized Credential object.
   * @throws IOException If the credentials.json file cannot be found or read.
   */
  private static Credential getCredentials(final NetHttpTransport HTTP_TRANSPORT)
      throws IOException {
    // Load client secrets. getResourceAsStream returns null (it does not throw) when the
    // resource is missing, so fail with a descriptive IOException instead of an NPE.
    InputStream in = ExtractText.class.getResourceAsStream(CREDENTIALS_FILE_PATH);
    if (in == null) {
      throw new java.io.FileNotFoundException("Resource not found: " + CREDENTIALS_FILE_PATH);
    }
    GoogleClientSecrets clientSecrets;
    // Closing the reader also closes the underlying resource stream.
    try (InputStreamReader reader =
        new InputStreamReader(in, java.nio.charset.StandardCharsets.UTF_8)) {
      clientSecrets = GoogleClientSecrets.load(JSON_FACTORY, reader);
    }

    // Build flow and trigger user authorization request.
    GoogleAuthorizationCodeFlow flow =
        new GoogleAuthorizationCodeFlow.Builder(HTTP_TRANSPORT, JSON_FACTORY, clientSecrets, SCOPES)
            .setDataStoreFactory(new FileDataStoreFactory(new java.io.File(TOKENS_DIRECTORY_PATH)))
            .setAccessType("offline")
            .build();
    LocalServerReceiver receiver = new LocalServerReceiver.Builder().setPort(8888).build();
    return new AuthorizationCodeInstalledApp(flow, receiver).authorize("user");
  }

  /**
   * Returns the given list, or an empty list when it is null, so callers can iterate without a
   * separate null check.
   */
  private static <T> List<T> orEmpty(List<T> list) {
    return list == null ? Collections.<T>emptyList() : list;
  }

  /**
   * Adds the provided tab to the list of all tabs, and recurses through and adds all child tabs.
   *
   * @param tab the tab to add, followed by its descendants
   * @param allTabs accumulator that receives the tabs in top-down order
   */
  private static void addCurrentAndChildTabs(Tab tab, List<Tab> allTabs) {
    allTabs.add(tab);
    // The loop variable must not reuse the parameter name; a tab without children may expose a
    // null child list, which is treated as "no children".
    for (Tab childTab : orEmpty(tab.getChildTabs())) {
      addCurrentAndChildTabs(childTab, allTabs);
    }
  }

  /**
   * Returns a flat list of all tabs in the document in the order they would appear in the UI
   * (top-down ordering). Includes all child tabs.
   *
   * @param doc the document whose tabs are flattened
   * @return every tab of the document; empty when the document exposes no tabs
   */
  private static List<Tab> getAllTabs(Document doc) {
    List<Tab> allTabs = new ArrayList<>();
    // Iterate over all tabs and recursively add any child tabs to generate a
    // flat list of Tabs.
    for (Tab tab : orEmpty(doc.getTabs())) {
      addCurrentAndChildTabs(tab, allTabs);
    }
    return allTabs;
  }

  /**
   * Returns the text in the given ParagraphElement.
   *
   * @param element a ParagraphElement from a Google Doc
   * @return the text run's content, or an empty string when there is none
   */
  private static String readParagraphElement(ParagraphElement element) {
    TextRun run = element.getTextRun();
    if (run == null || run.getContent() == null) {
      // The TextRun can be null if there is an inline object.
      return "";
    }
    return run.getContent();
  }

  /**
   * Recurses through a list of Structural Elements to read a document's text where text may be in
   * nested elements.
   *
   * @param elements a list of Structural Elements; null is treated as empty
   * @return the concatenated text of all paragraphs, table cells and table-of-contents entries
   */
  private static String readStructuralElements(List<StructuralElement> elements) {
    StringBuilder sb = new StringBuilder();
    for (StructuralElement element : orEmpty(elements)) {
      if (element.getParagraph() != null) {
        for (ParagraphElement paragraphElement : orEmpty(element.getParagraph().getElements())) {
          sb.append(readParagraphElement(paragraphElement));
        }
      } else if (element.getTable() != null) {
        // The text in table cells are in nested Structural Elements and tables may be
        // nested.
        for (TableRow row : orEmpty(element.getTable().getTableRows())) {
          for (TableCell cell : orEmpty(row.getTableCells())) {
            sb.append(readStructuralElements(cell.getContent()));
          }
        }
      } else if (element.getTableOfContents() != null) {
        // The text in the TOC is also in a Structural Element.
        sb.append(readStructuralElements(element.getTableOfContents().getContent()));
      }
    }
    return sb.toString();
  }

  /**
   * Fetches the document identified by DOCUMENT_ID with tab content included and prints the text
   * of each tab to standard output.
   *
   * @param args unused
   * @throws IOException if authorization or the API request fails
   * @throws GeneralSecurityException if the trusted HTTP transport cannot be created
   */
  public static void main(String... args) throws IOException, GeneralSecurityException {
    // Build a new authorized API client service.
    final NetHttpTransport HTTP_TRANSPORT = GoogleNetHttpTransport.newTrustedTransport();
    Docs service =
        new Docs.Builder(HTTP_TRANSPORT, JSON_FACTORY, getCredentials(HTTP_TRANSPORT))
            .setApplicationName(APPLICATION_NAME)
            .build();

    // Fetch the document with all of the tabs populated, including any nested
    // child tabs.
    Document doc =
        service.documents().get(DOCUMENT_ID).setIncludeTabsContent(true).execute();
    List<Tab> allTabs = getAllTabs(doc);

    // Print the text from each tab in the document.
    for (Tab tab : allTabs) {
      // Get the DocumentTab from the generic Tab; skip tabs that carry no document body.
      DocumentTab documentTab = tab.getDocumentTab();
      if (documentTab == null || documentTab.getBody() == null) {
        continue;
      }
      System.out.println(readStructuralElements(documentTab.getBody().getContent()));
    }
  }
}
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Recursively extracts the text from a Google Doc.
"""
import googleapiclient.discovery as discovery
from httplib2 import Http
from oauth2client import client
from oauth2client import file
from oauth2client import tools
# OAuth 2.0 scope passed to the authorization flow in get_credentials()
# (the read-only documents scope).
SCOPES = 'https://www.googleapis.com/auth/documents.readonly'
# Discovery document URL handed to discovery.build() in main().
DISCOVERY_DOC = 'https://docs.googleapis.com/$discovery/rest?version=v1'
# Placeholder: replace with the ID of the document to read.
DOCUMENT_ID = 'YOUR_DOCUMENT_ID'
def get_credentials():
  """Returns usable user credentials, authorizing the user when necessary.

  Credentials cached in 'token.json' are reused when they exist and are still
  valid. Otherwise the OAuth 2.0 flow is run from 'credentials.json' and the
  resulting credentials are written back to the same store.

  Returns:
    Credentials, the obtained credential.
  """
  token_store = file.Storage('token.json')
  cached = token_store.get()
  # Fast path: a stored, still-valid credential needs no user interaction.
  if cached and not cached.invalid:
    return cached
  oauth_flow = client.flow_from_clientsecrets('credentials.json', SCOPES)
  return tools.run_flow(oauth_flow, token_store)
def add_current_and_child_tabs(tab, all_tabs):
  """Adds the provided tab to the list of all tabs, and recurses through and
  adds all child tabs.

  Tabs are appended in pre-order (parent first, then each child subtree), so
  the resulting list follows the top-down order of the tab tree.

  Args:
    tab: a Tab from a Google Doc (a dict).
    all_tabs: a list of all tabs in the document; mutated in place.
  """
  all_tabs.append(tab)
  # A tab without children may carry no 'childTabs' key at all, in which case
  # .get() returns None; treat that as an empty list instead of iterating None.
  for child_tab in tab.get('childTabs') or []:
    add_current_and_child_tabs(child_tab, all_tabs)
def get_all_tabs(doc):
  """Returns a flat list of all tabs in the document in the order they would
  appear in the UI (top-down ordering). Includes all child tabs.

  Args:
    doc: a document (a dict as returned by the Docs API).

  Returns:
    A list of tab dicts; empty when the document has no 'tabs' entry.
  """
  all_tabs = []
  # Iterate over all tabs and recursively add any child tabs to generate a
  # flat list of Tabs. A document without a 'tabs' key yields an empty list
  # rather than a TypeError from iterating None.
  for tab in doc.get('tabs') or []:
    add_current_and_child_tabs(tab, all_tabs)
  return all_tabs
def read_paragraph_element(element):
  """Returns the text in the given ParagraphElement.

  Args:
    element: a ParagraphElement from a Google Doc (a dict).

  Returns:
    The 'content' string of the element's text run, or '' when the element
    has no text run or the text run has no content. Always a str, so callers
    can concatenate the result safely.
  """
  text_run = element.get('textRun')
  if not text_run:
    return ''
  # A text run without 'content' must yield '' rather than None.
  return text_run.get('content') or ''
def read_structural_elements(elements):
  """Recurses through a list of Structural Elements to read a document's text
  where text may be in nested elements.

  Args:
    elements: a list of Structural Elements (dicts); None is treated as empty.

  Returns:
    The concatenated text of all paragraphs, table cells and table-of-contents
    entries, in document order.
  """
  parts = []
  for element in elements or []:
    if 'paragraph' in element:
      # Use a distinct name here: rebinding the `elements` parameter inside
      # the loop that iterates over it is easy to misread.
      paragraph_elements = element.get('paragraph').get('elements') or []
      for paragraph_element in paragraph_elements:
        parts.append(read_paragraph_element(paragraph_element) or '')
    elif 'table' in element:
      # The text in table cells are in nested Structural Elements and tables
      # may be nested.
      table = element.get('table')
      for row in table.get('tableRows') or []:
        for cell in row.get('tableCells') or []:
          parts.append(read_structural_elements(cell.get('content')))
    elif 'tableOfContents' in element:
      # The text in the TOC is also in a Structural Element.
      toc = element.get('tableOfContents')
      parts.append(read_structural_elements(toc.get('content')))
  # Join once at the end instead of repeated string concatenation.
  return ''.join(parts)
def main():
  """Uses the Docs API to print out the text of a document.

  Authorizes the user, fetches the document identified by DOCUMENT_ID with the
  content of all tabs included, and prints the text of each tab.
  """
  credentials = get_credentials()
  http = credentials.authorize(Http())
  docs_service = discovery.build(
      'docs', 'v1', http=http, discoveryServiceUrl=DISCOVERY_DOC
  )

  # Fetch the document with all of the tabs populated, including any nested
  # child tabs. Keyword arguments of discovery-built methods use the API's own
  # parameter names, so the flag is spelled `includeTabsContent`.
  doc = (
      docs_service.documents()
      .get(documentId=DOCUMENT_ID, includeTabsContent=True)
      .execute()
  )
  all_tabs = get_all_tabs(doc)

  # Print the text from each tab in the document.
  for tab in all_tabs:
    # Get the DocumentTab from the generic Tab; skip tabs that carry no
    # document body instead of failing on a missing key.
    document_tab = tab.get('documentTab')
    if not document_tab or not document_tab.get('body'):
      continue
    doc_content = document_tab.get('body').get('content')
    print(read_structural_elements(doc_content))


if __name__ == '__main__':
  main()