changeset 12:e00b5301263a

def convert pdf function
author anatofuz <anatofuz@cr.ie.u-ryukyu.ac.jp>
date Wed, 01 Apr 2020 09:31:19 +0900
parents e7d9f63d969c
children 989cfda07d71
files cmd_pdf.go go.mod go.sum pdf.go
diffstat 4 files changed, 60 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/cmd_pdf.go	Tue Mar 31 19:15:18 2020 +0900
+++ b/cmd_pdf.go	Wed Apr 01 09:31:19 2020 +0900
@@ -2,6 +2,7 @@
 
 import (
 	"context"
+	"fmt"
 	"io"
 )
 
@@ -16,5 +17,7 @@
 }
 
 func (cd *cmdPDF) run(ctx context.Context, argv []string, outStream, errStream io.Writer) error {
+	h, _ := convertStringFromPDF(&argv)
+	fmt.Println(h)
 	return nil
 }
--- a/go.mod	Tue Mar 31 19:15:18 2020 +0900
+++ b/go.mod	Wed Apr 01 09:31:19 2020 +0900
@@ -3,6 +3,8 @@
 go 1.14
 
 require (
+	github.com/ledongthuc/pdf v0.0.0-20200323191019-23c5852adbd2
 	github.com/pkg/errors v0.9.1
+	golang.org/x/text v0.3.2
 	golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543
 )
--- a/go.sum	Tue Mar 31 19:15:18 2020 +0900
+++ b/go.sum	Wed Apr 01 09:31:19 2020 +0900
@@ -1,4 +1,11 @@
+github.com/dcu/pdf v0.0.0-20190612170416-c8b299d05f89 h1:h8vPbGmLBQY15p63w4I9dGQWc6YtR3p+uG1j/T+GrF4=
+github.com/dcu/pdf v0.0.0-20190612170416-c8b299d05f89/go.mod h1:Y73szhmilZ/gaudo99AA26HG9ldrKYcVIyKBne65nMQ=
+github.com/ledongthuc/pdf v0.0.0-20200323191019-23c5852adbd2 h1:H9HhyvygtvWnn1R8ymra4vdIUOvDDlaPlX6mjoJ9UTY=
+github.com/ledongthuc/pdf v0.0.0-20200323191019-23c5852adbd2/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
 github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
 github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
+golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
+golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4=
 golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pdf.go	Wed Apr 01 09:31:19 2020 +0900
@@ -0,0 +1,48 @@
+package lectable
+
+import (
+	"bytes"
+	"regexp"
+	"strings"
+
+	"github.com/ledongthuc/pdf"
+	"github.com/pkg/errors"
+	"golang.org/x/text/unicode/norm"
+)
+
+// lectureFPDF groups the id, body, isSelect, day and place of one entry,
+// presumably a lecture read from a PDF (no user of this type is visible here;
+// confirm against its callers).
+//
+// The json tags use the quoted `json:"name"` form that reflect.StructTag
+// requires; the unquoted form cannot be looked up by key.
+// NOTE(review): every field is unexported, so encoding/json ignores them
+// regardless of the tags. Export the fields if this type is meant to be
+// marshaled.
+type lectureFPDF struct {
+	id       string `json:"id"`
+	body     string `json:"body"`
+	isSelect bool   `json:"isSelect"`
+	day      string `json:"day"`
+	place    string `json:"place"`
+}
+
+// idRgex matches a run of exactly nine consecutive digits anywhere in its
+// input. Presumably this is the shape of a lecture ID; confirm against the
+// code that uses it.
+var idRgex = regexp.MustCompile(`\d{9}`)
+
+// pdfPagesStr is the list of text chunks taken from a single PDF:
+// convertStringFromPDF splits one file's extracted text on its page
+// separator and stores the pieces in a value of this type.
+type pdfPagesStr []string
+
+// convertStringFromPDF extracts the plain text of every PDF named in pdfPaths.
+//
+// The result holds one pdfPagesStr per input path, in input order. Processing
+// stops at the first file that fails, and that error is returned wrapped with
+// a short description of the failing step. A nil pdfPaths yields an empty
+// result instead of a panic.
+func convertStringFromPDF(pdfPaths *[]string) (*[]pdfPagesStr, error) {
+	var ppsr []pdfPagesStr
+	if pdfPaths == nil {
+		return &ppsr, nil
+	}
+	for _, apdf := range *pdfPaths {
+		pages, err := pdfPageStrings(apdf)
+		if err != nil {
+			return nil, err
+		}
+		ppsr = append(ppsr, pages)
+	}
+	return &ppsr, nil
+}
+
+// pdfPageStrings opens the PDF at path, reads its plain text, applies NFKC
+// normalization and splits the text into chunks.
+//
+// It exists as its own function so that the deferred Close runs as soon as
+// this one file is done, not when the whole batch has been processed.
+func pdfPageStrings(path string) (pdfPagesStr, error) {
+	file, reader, err := pdf.Open(path)
+	if file != nil {
+		// Close whatever handle came back, even when Open also reported an error.
+		defer file.Close()
+	}
+	if err != nil {
+		return nil, errors.Wrap(err, "failed open pdf")
+	}
+
+	preader, err := reader.GetPlainText()
+	if err != nil {
+		return nil, errors.Wrap(err, "failed convert pdf")
+	}
+
+	var buf bytes.Buffer
+	if _, err := buf.ReadFrom(preader); err != nil {
+		return nil, errors.Wrap(err, "failed read pdf text")
+	}
+	planString := string(norm.NFKC.Bytes(buf.Bytes()))
+
+	// The original author notes that "   " marks the next PDF page.
+	planstrings := strings.Split(planString, "   ")
+	// The first chunk is garbage, so drop it. Split always returns at least
+	// one element, so this slice expression cannot go out of range.
+	return planstrings[1:], nil
+}